annotate dataset.py @ 266:6e69fb91f3c0

initial commit of amat
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 04 Jun 2008 17:49:09 -0400
parents 19b14afe04b7
children 3f1cd8897fda
rev   line source
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
1
12
ff4e551490f1 Added LookupList type in lookup_list.py and used it to keep order
bengioy@esprit.iro.umontreal.ca
parents: 11
diff changeset
2 from lookup_list import LookupList
ff4e551490f1 Added LookupList type in lookup_list.py and used it to keep order
bengioy@esprit.iro.umontreal.ca
parents: 11
diff changeset
3 Example = LookupList
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
4 from misc import unique_elements_list_intersection
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
5 from string import join
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
6 from sys import maxint
171
895b4b60f5e8 bugfix. Otherwise the example was writed over and not a new one was returned
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 167
diff changeset
7 import numpy, copy
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
8
166
ee11ed427ba8 Created exceptions.py
Joseph Turian <turian@gmail.com>
parents: 163
diff changeset
9 from exceptions import *
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
10
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
11 class AttributesHolder(object):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
12 def __init__(self): pass
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
13
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
14 def attributeNames(self):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
15 raise AbstractFunction()
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
16
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
17 def setAttributes(self,attribute_names,attribute_values,make_copies=False):
134
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
18 """
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
19 Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1.
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
20 """
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
21 if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ):
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
22 attribute_values = [attribute_values]
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
23 if make_copies:
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
24 for name,value in zip(attribute_names,attribute_values):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
25 self.__setattr__(name,copy.deepcopy(value))
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
26 else:
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
27 for name,value in zip(attribute_names,attribute_values):
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
28 self.__setattr__(name,value)
193
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
29
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
30 def getAttributes(self,attribute_names=None, return_copy=False):
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
31 """
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
32 Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes.
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
33 """
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
34 if attribute_names is None:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
35 attribute_names = self.attributeNames()
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
36 if return_copy:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
37 return [copy.copy(self.__getattribute__(name)) for name in attribute_names]
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
38 else:
cb6b945acf5a Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 188
diff changeset
39 return [self.__getattribute__(name) for name in attribute_names]
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
40
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
41
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
42 class DataSet(AttributesHolder):
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
43 """A virtual base class for datasets.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
44
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
45 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
46 with learning algorithms (for training and testing them): rows/records are called examples, and
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
47 columns/attributes are called fields. The field value for a particular example can be an arbitrary
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
48 python object, which depends on the particular dataset.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
49
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
50 We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
51 should return sys.maxint).
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
52
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
53 A DataSet is a generator of iterators; these iterators can run through the
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
54 examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
55 or known length, so this class can be used to interface to a 'stream' which
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
56 feeds on-line learning (however, as noted below, some operations are not
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
57 feasible or not recommended on streams).
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
58
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
59 To iterate over examples, there are several possibilities:
90
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
60 - for example in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
61 - for val1,val2,... in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
62 - for example in dataset(field1, field2,field3, ...):
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
63 - for val1,val2,val3 in dataset(field1, field2,field3):
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
64 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
65 - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
66 Each of these is documented below. All of these iterators are expected
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
67 to provide, in addition to the usual 'next()' method, a 'next_index()' method
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
68 which returns a non-negative integer pointing to the position of the next
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
69 example that will be returned by 'next()' (or of the first example in the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
70 next minibatch returned). This is important because these iterators
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
71 can wrap around the dataset in order to do multiple passes through it,
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
72 in possibly irregular ways if the minibatch size is not a divisor of the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
73 dataset length.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
74
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
75 To iterate over fields, one can do
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
76 - for field in dataset.fields():
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
77 for field_value in field: # iterate over the values associated to that field for all the dataset examples
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
78 - for field in dataset(field1,field2,...).fields() to select a subset of fields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
79 - for field in dataset.fields(field1,field2,...) to select a subset of fields
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
80 and each of these fields is iterable over the examples:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
81 - for field_examples in dataset.fields():
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
82 for example_value in field_examples:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
83 ...
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
84 but when the dataset is a stream (unbounded length), it is not recommended to do
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
85 such things because the underlying dataset may refuse to access the different fields in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
86 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
132
f6505ec32dc3 Updated documentation slightly
Joseph Turian <turian@gmail.com>
parents: 128
diff changeset
87 The result of fields() is a L{DataSetFields} object, which iterates over fields,
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
88 and whose elements are iterable over examples. A DataSetFields object can
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
89 be turned back into a DataSet with its examples() method::
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
90 dataset2 = dataset1.fields().examples()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
91 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
92
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
93 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
94
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
95 Note: The content of a field can be of any type. Field values can also be 'missing'
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
96 (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
97 fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value.
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
98 For non-numeric fields, None plays the role of a missing value.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
99
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
100 Dataset elements can be indexed and sub-datasets (with a subset
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
101 of examples) can be extracted. These operations are not supported
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
102 by default in the case of streams.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
103
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
104 - dataset[:n] returns a dataset with the n first examples.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
105
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
106 - dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
107
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
108 - dataset[i] returns an Example.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
109
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
110 - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
111
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
112 - dataset.<property> returns the value of a property associated with
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
113 the name <property>. The following properties should be supported:
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
114 - 'description': a textual description or name for the dataset
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
115 - 'fieldtypes': a list of types (one per field)
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
116 A DataSet may have other attributes that it makes visible to other objects. These are
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
117 used to store information that is not example-wise but global to the dataset.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
118 The list of names of these attributes is given by the attributeNames() method.
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
119
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
120 Datasets can be concatenated either vertically (increasing the length) or
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
121 horizontally (augmenting the set of fields), if they are compatible, using
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
122 the following operations (with the same basic semantics as numpy.hstack
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
123 and numpy.vstack):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
124
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
125 - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
126
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
127 creates a new dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
128 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
129
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
130 - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
131
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
132 creates a new dataset that concatenates the examples from the argument datasets
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
133 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
134 works if they all have the same fields.
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
135
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
136 According to the same logic, and viewing a DataSetFields object associated to
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
137 a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
138 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
139 examples.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
140
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
141 A dataset can hold arbitrary key-value pairs that may be used to access meta-data
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
142 or other properties of the dataset or associated with the dataset or the result
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
143 of a computation stored in a dataset. These can be accessed through the [key] syntax
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
144 when key is a string (or more specifically, neither an integer, a slice, nor a list).
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
145
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
146 A DataSet sub-class should always redefine the following methods:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
147 - __len__ if it is not a stream
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
148 - fieldNames
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
149 - minibatches_nowrap (called by DataSet.minibatches())
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
150 - valuesHStack
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
151 - valuesVStack
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
152 For efficiency of implementation, a sub-class might also want to redefine
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
153 - hasFields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
154 - __getitem__ may not be feasible with some streams
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
155 - __iter__
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
156 A sub-class should also append attributes to self._attribute_names
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
157 (the default value returned by attributeNames()).
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
158 By convention, attributes not in attributeNames() should have a name
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
159 starting with an underscore.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
160 @todo enforce/test that convention!
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
161
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
162 """
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
163
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
164 if 0:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
165 # removed by James June 4... these aren't used anywhere according to
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
166 # grep
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
167 numpy_vstack = lambda fieldname,values: numpy.vstack(values)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
168 numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
77
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
169
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
170 def __init__(self,description=None,fieldtypes=None):
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
171 if description is None:
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
172 # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
173 description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
174 self.description=description
60
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 57
diff changeset
175 self.fieldtypes=fieldtypes
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
176 self._attribute_names = ["description"]
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
177 if fieldtypes:
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
178 self._attribute_names.append("fieldtypes")
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
179
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
180 def attributeNames(self): return self._attribute_names
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
181
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
182 class MinibatchToSingleExampleIterator(object):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
183 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
184 Converts the result of a minibatch iterator with minibatch_size==1 into
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
185 single-example values in the result. Therefore the result of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
186 iterating on the dataset itself gives a sequence of single examples
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
187 (whereas the result of iterating over minibatches gives in each
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
188 Example field an iterable object over the individual examples in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
189 the minibatch).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
190 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
191 def __init__(self, minibatch_iterator):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
192 self.minibatch_iterator = minibatch_iterator
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
193 self.minibatch = None
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
194 def __iter__(self): #makes for loop work
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
195 return self
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
196 def next(self):
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
197 size1_minibatch = self.minibatch_iterator.next()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
198 if not self.minibatch:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
199 self.minibatch = Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
200 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
201 self.minibatch._values = [value[0] for value in size1_minibatch.values()]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
202 return self.minibatch
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
203
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
204 def next_index(self):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
205 return self.minibatch_iterator.next_index()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
206
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
207 def __iter__(self):
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
208 """Supports the syntax "for i in dataset: ..."
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
209
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
210 Using this syntax, "i" will be an Example instance (or equivalent) with
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
211 all the fields of DataSet self. Every field of "i" will give access to
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
212 a field of a single example. Fields should be accessible via
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
213 i["fieldname"] or i[3] (in the order defined by the elements of the
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
214 Example returned by this iterator), but the derived class is free
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
215 to accept any type of identifier, and add extra functionality to the iterator.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
216
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
217 The default implementation calls the minibatches iterator and extracts the first example of each field.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
218 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
219 return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1))
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
220
188
f01ac276c6fb added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 171
diff changeset
221 def __contains__(self, fieldname):
f01ac276c6fb added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 171
diff changeset
222 return (fieldname in self.fieldNames()) \
f01ac276c6fb added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 171
diff changeset
223 or (fieldname in self.attributeNames())
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
224
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
225 class MinibatchWrapAroundIterator(object):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
226 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
227 An iterator for minibatches that handles the case where we need to wrap around the
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
228 dataset because n_batches*minibatch_size > len(dataset). It is constructed from
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
229 a dataset that provides a minibatch iterator that does not need to handle that problem.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
230 This class is a utility for dataset subclass writers, so that they do not have to handle
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
231 this issue multiple times, nor check that fieldnames are valid, nor handle the
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
232 empty fieldnames (meaning 'use all the fields').
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
233 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
234 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
235 self.dataset=dataset
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
236 self.fieldnames=fieldnames
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
237 self.minibatch_size=minibatch_size
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
238 self.n_batches=n_batches
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
239 self.n_batches_done=0
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
240 self.next_row=offset
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
241 self.offset=offset
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
242 self.L=len(dataset)
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
243 assert offset+minibatch_size<=self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
244 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
245 if n_batches is not None:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
246 ds_nbatches = min(n_batches,ds_nbatches)
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
247 if fieldnames:
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
248 assert dataset.hasFields(*fieldnames)
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
249 else:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
250 self.fieldnames=dataset.fieldNames()
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
251 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
252 ds_nbatches,self.next_row)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
253
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
254 def __iter__(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
255 return self
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
256
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
257 def next_index(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
258 return self.next_row
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
259
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
260 def next(self):
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
261 if self.n_batches and self.n_batches_done==self.n_batches:
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
262 raise StopIteration
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
263 elif not self.n_batches and self.next_row ==self.L:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
264 raise StopIteration
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
265 upper = self.next_row+self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
266 if upper <=self.L:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
267 minibatch = self.iterator.next()
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
268 else:
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
269 if not self.n_batches:
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
270 upper=min(upper, self.L)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
271 # if there is not a fixed number of batches, we continue to the end of the dataset.
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
272 # this can create a minibatch that is smaller than the minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
273 assert (self.L-self.next_row)<=self.minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
274 minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
275 else:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
276 # we must concatenate (vstack) the bottom and top parts of our minibatch
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
277 # first get the beginning of our minibatch (top of dataset)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
278 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
279 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
280
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
281 blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
282 for name in self.fieldnames]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
283 print type(self.dataset), blah
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
284 minibatch = Example(self.fieldnames,blah)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
285 self.next_row=upper
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
286 self.n_batches_done+=1
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
287 if upper >= self.L and self.n_batches:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
288 self.next_row -= self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
289 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
290 if self.n_batches is not None:
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
291 ds_nbatches = min(self.n_batches,ds_nbatches)
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
292 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
293 ds_nbatches,self.next_row)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
294 return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
295 self.dataset.valuesHStack),
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
296 minibatch.keys())
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
297
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
298
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
299 minibatches_fieldnames = None
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
300 minibatches_minibatch_size = 1
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
301 minibatches_n_batches = None
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
302 def minibatches(self,
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
303 fieldnames = minibatches_fieldnames,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
304 minibatch_size = minibatches_minibatch_size,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
305 n_batches = minibatches_n_batches,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
306 offset = 0):
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
307 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
308 Return an iterator that supports three forms of syntax:
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
309
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
310 for i in dataset.minibatches(None,**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
311
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
312 for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
313
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
314 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
315
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
316 Using the first two syntaxes, "i" will be an indexable object, such as a list,
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
317 tuple, or Example instance. In both cases, i[k] is a list-like container
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
318 of a batch of current examples. In the second case, i[0] is
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
319 a list-like container of the f1 field of a batch of current examples, i[1] is
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
320 a list-like container of the f2 field, etc.
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
321
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
322 Using the first syntax, all the fields will be returned in "i".
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
323 Using the third syntax, i1, i2, i3 will be list-like containers of the
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
324 f1, f2, and f3 fields of a batch of examples on each loop iteration.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
325
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
326 The minibatches iterator is expected to return upon each call to next()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
327 a DataSetFields object, which is a LookupList (indexed by the field names) whose
80
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
328 elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
329 a sub-dataset that can be used to iterate over the individual examples
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
330 in the minibatch. Hence a minibatch can be converted back to a regular
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
331 dataset or its fields can be looked at individually (and possibly iterated over).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
332
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
333 PARAMETERS
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
334 - fieldnames (list of any type, default None):
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
335 The loop variables i1, i2, i3 (in the example above) should contain the
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
336 f1, f2, and f3 fields of the current batch of examples. If None, the
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
337 derived class can choose a default, e.g. all fields.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
338
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
339 - minibatch_size (integer, default 1)
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
340 On every iteration, the variables i1, i2, i3 will have
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
341 exactly minibatch_size elements, e.g. len(i1) == minibatch_size (except that, when n_batches is not given, the last minibatch may be smaller).
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
342
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
343 - n_batches (integer, default None)
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
344 The iterator will loop exactly this many times, and then stop. If None,
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
345 the derived class can choose a default. If (-1), then the returned
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
346 iterator should support looping indefinitely.
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
347
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
348 - offset (integer, default 0)
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
349 The iterator will start at example 'offset' in the dataset, rather than the default.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
350
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
351 Note: A list-like container is something like a tuple, list, numpy.ndarray or
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
352 any other object that supports integer indexing and slicing.
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
353
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
354 """
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
355 return DataSet.MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
356
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
357 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
358 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
359 This is the minibatches iterator generator that sub-classes must define.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
360 It does not need to worry about wrapping around multiple times across the dataset,
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
361 as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
362 The next() method of the returned iterator does not even need to worry about
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
363 the termination condition (as StopIteration will be raised by DataSet.minibatches
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
364 before an improper call to minibatches_nowrap's next() is made).
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
365 That next() method can assert that its next row will always be within [0,len(dataset)).
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
366 The iterator returned by minibatches_nowrap does not need to implement
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
367 a next_index() method either, as this will be provided by MinibatchWrapAroundIterator.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
368 """
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
369 raise AbstractFunction()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
370
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
371 def __len__(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
372 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
373 len(dataset) returns the number of examples in the dataset.
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
374 By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
375 Sub-classes which implement finite-length datasets should redefine this method.
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
376 Some methods only make sense for finite-length datasets.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
377 """
123
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 110
diff changeset
378 return maxint
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
379
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
380 def is_unbounded(self):
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
381 """
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
382 Tests whether a dataset is unbounded (e.g. a stream).
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
383 """
123
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 110
diff changeset
384 return len(self)==maxint
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
385
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
386 def hasFields(self,*fieldnames):
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
387 """
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
388 Return true if the given field name (or field names, if multiple arguments are
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
389 given) is recognized by the DataSet (i.e. can be used as a field name in one
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
390 of the iterators).
29
46c5c90019c2 Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents: 28
diff changeset
391
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
392 The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
393 method. Many datasets may store their field names in a dictionary, which would allow more efficiency.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
394 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
395 return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
396
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
397 def fieldNames(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
398 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
399 Return the list of field names that are supported by the iterators,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
400 and for which hasFields(fieldname) would return True.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
401 """
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
402 raise AbstractFunction()
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
403
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
404 def __call__(self,*fieldnames):
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
405 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
406 Return a dataset that sees only the fields whose names are specified.
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
407 """
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
408 assert self.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
409 return self.fields(*fieldnames).examples()
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
410
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
411 def fields(self,*fieldnames):
29
46c5c90019c2 Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents: 28
diff changeset
412 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
413 Return a DataSetFields object associated with this dataset.
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
414 """
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
415 return DataSetFields(self,fieldnames)
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
416
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
417 def __getitem__(self,i):
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
418 """
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
419 dataset[i] returns the (i+1)-th example of the dataset.
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
420 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
421 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2s,... (up to but excluding j).
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
422 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
423 dataset['key'] returns a property associated with the given 'key' string.
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
424 If 'key' is a fieldname, then the VStacked field values (iterable over
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
425 field values) for that field are returned. Other keys may be supported
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
426 by different dataset subclasses. The following key names are encouraged:
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
427 - 'description': a textual description or name for the dataset
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
428 - '<fieldname>.type': a type name or value for a given <fieldname>
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
429
39
c682c6e9bf93 Minor edits
bengioy@esprit.iro.umontreal.ca
parents: 38
diff changeset
430 Note that some stream datasets may be unable to implement random access, i.e.
c682c6e9bf93 Minor edits
bengioy@esprit.iro.umontreal.ca
parents: 38
diff changeset
431 arbitrary slicing/indexing
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
432 because they can only iterate through examples one or a minibatch at a time
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
433 and do not actually store or keep past (or future) examples.
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
434
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
435 The default implementation of getitem uses the minibatches iterator
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
436 to obtain one example, one slice, or a list of examples. It may not
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
437 always be the most efficient way to obtain the result, especially if
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
438 the data are actually stored in a memory array.
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
439 """
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
440 # check for an index
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
441 if type(i) is int:
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
442 return DataSet.MinibatchToSingleExampleIterator(
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
443 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
444 rows=None
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
445 # or a slice
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
446 if type(i) is slice:
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
447 #print 'i=',i
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
448 if not i.start: i=slice(0,i.stop,i.step)
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
449 if not i.stop: i=slice(i.start,len(self),i.step)
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
450 if not i.step: i=slice(i.start,i.stop,1)
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
451 if i.step is 1:
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
452 return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
453 rows = range(i.start,i.stop,i.step)
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
454 # or a list of indices
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
455 elif type(i) is list:
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
456 rows = i
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
457 if rows is not None:
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
458 examples = [self[row] for row in rows]
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
459 fields_values = zip(*examples)
45
a5c70dc42972 Test functions for dataset.py
bengioy@grenat.iro.umontreal.ca
parents: 44
diff changeset
460 return MinibatchDataSet(
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
461 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
462 for fieldname,field_values
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
463 in zip(self.fieldNames(),fields_values)]),
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
464 self.valuesVStack,self.valuesHStack)
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
465
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
466 raise TypeError(i)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
467 if 0:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
468 # else check for a fieldname
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
469 #after talk with Yoshua June 4, this is disabled.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
470 if self.hasFields(i):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
471 return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
472 # else we are trying to access a property of the dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
473 assert i in self.__dict__ # else it means we are trying to access a non-existing property
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
474 return self.__dict__[i]
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
475
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
476 def valuesHStack(self,fieldnames,fieldvalues):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
477 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
478 Return a value that corresponds to concatenating (horizontally) several field values.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
479 This can be useful to merge some fields. The implementation of this operation is likely
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
480 to involve a copy of the original values. When the values are numpy arrays, the
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
481 result should be numpy.hstack(values). If it makes sense, this operation should
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
482 work as well when each value corresponds to multiple examples in a minibatch
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
483 e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
484 then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
485 The default is to use numpy.hstack for numpy.ndarray values, and a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
486 pointing to the original values for other data types.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
487 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
488 all_numpy=True
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
489 for value in fieldvalues:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
490 if not type(value) is numpy.ndarray:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
491 all_numpy=False
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
492 if all_numpy:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
493 return numpy.hstack(fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
494 # the default implementation of horizontal stacking is to put values in a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
495 return fieldvalues
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
496
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
497
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
498 def valuesVStack(self,fieldname,values):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
499 """
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
500 @param fieldname: the name of the field from which the values were taken
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
501 @type fieldname: any type
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
502
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
503 @param values: bits near the beginning or end of the dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
504 @type values: list of minibatches (returned by minibatches_nowrap)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
505
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
506 @return: the concatenation (stacking) of the values
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
507 @rtype: something suitable as a minibatch field
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
508
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
509 """
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
510 rval = []
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
511 for sub_batch in values:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
512 rval.extend(sub_batch)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
513 return rval
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
514
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
515 def __or__(self,other):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
516 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
517 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
518 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
519 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
520 return HStackedDataSet([self,other])
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
521
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
522 def __and__(self,other):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
523 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
524 dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
525 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
526 works if they all have the same fields.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
527 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
528 return VStackedDataSet([self,other])
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
529
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
530 def hstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
531 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
532 hstack([dataset1,dataset2,...]) returns dataset1 | dataset2 | ...
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
533 which is a dataset whose fields list is the concatenation of the fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
534 of the individual datasets.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
535 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
536 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
537 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
538 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
539 return HStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
540
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
541 def vstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
542 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
543 vstack([dataset1,dataset2,...]) returns dataset1 & dataset2 & ...
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
544 which is a dataset which iterates first over the examples of dataset1, then
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
545 over those of dataset2, etc.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
546 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
547 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
548 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
549 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
550 return VStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
551
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
552 class FieldsSubsetDataSet(DataSet):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
553 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
554 A sub-class of L{DataSet} that selects a subset of the fields.
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
555 """
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
556 def __init__(self,src,fieldnames):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
557 self.src=src
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
558 self.fieldnames=fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
559 assert src.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
560 self.valuesHStack = src.valuesHStack
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
561 self.valuesVStack = src.valuesVStack
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
562
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
563 def __len__(self): return len(self.src)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
564
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
565 def fieldNames(self):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
566 return self.fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
567
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
568 def __iter__(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
569 class FieldsSubsetIterator(object):
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
570 def __init__(self,ds):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
571 self.ds=ds
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
572 self.src_iter=ds.src.__iter__()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
573 self.example=None
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
574 def __iter__(self): return self
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
575 def next(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
576 complete_example = self.src_iter.next()
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
577 if self.example:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
578 self.example._values=[complete_example[field]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
579 for field in self.ds.fieldnames]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
580 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
581 self.example=Example(self.ds.fieldnames,
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
582 [complete_example[field] for field in self.ds.fieldnames])
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
583 return self.example
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
584 return FieldsSubsetIterator(self)
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
585
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
586 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
587 assert self.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
588 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
589 def __getitem__(self,i):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
590 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
591
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
592
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
593 class DataSetFields(LookupList):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
594 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
595 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
596 DataSetFields iterates over fields (like columns of a matrix), and can be understood
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
597 as a transpose of the associated dataset.
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
598
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
599 To iterate over fields, one can do
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
600 * for fields in dataset.fields()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
601 * for fields in dataset(field1,field2,...).fields() to select a subset of fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
602 * for fields in dataset.fields(field1,field2,...) to select a subset of fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
603 and each of these fields is iterable over the examples:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
604 * for field_examples in dataset.fields():
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
605 for example_value in field_examples:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
606 ...
241
ddb88a8e9fd2 If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents: 231
diff changeset
607 but when the dataset is a stream (unbounded length), it is not recommended to do
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
608 such things because the underlying dataset may refuse to access the different fields in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
609 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
610 The result of fields() is a DataSetFields object, which iterates over fields,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
611 and whose elements are iterable over examples. A DataSetFields object can
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
612 be turned back into a DataSet with its examples() method:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
613 dataset2 = dataset1.fields().examples()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
614 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
615
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
616 DataSetFields can be concatenated vertically or horizontally. To be consistent with
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
617 the syntax used for DataSets, the | concatenates the fields and the & concatenates
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
618 the examples.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
619 """
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
620 def __init__(self,dataset,fieldnames):
65
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
621 original_dataset=dataset
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
622 if not fieldnames:
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
623 fieldnames=dataset.fieldNames()
65
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
624 elif not fieldnames==dataset.fieldNames():
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
625 dataset = FieldsSubsetDataSet(dataset,fieldnames)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
626 assert dataset.hasFields(*fieldnames)
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
627 self.dataset=dataset
66
dde1fb1b63ba fixed test and removed print
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 65
diff changeset
628
64
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
629 if isinstance(dataset,MinibatchDataSet):
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
630 LookupList.__init__(self,fieldnames,list(dataset._fields))
65
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
631 elif isinstance(original_dataset,MinibatchDataSet):
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
632 LookupList.__init__(self,fieldnames,
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
633 [original_dataset._fields[field]
d48eba49a2f4 fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 64
diff changeset
634 for field in fieldnames])
64
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
635 else:
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
636 minibatch_iterator = dataset.minibatches(fieldnames,
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
637 minibatch_size=len(dataset),
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
638 n_batches=1)
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
639 minibatch=minibatch_iterator.next()
863da25a60f1 trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 62
diff changeset
640 LookupList.__init__(self,fieldnames,minibatch)
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
641
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
642 def examples(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
643 return self.dataset
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
644
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
645 def __or__(self,other):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
646 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
647 fields1 | fields2 is a DataSetFields whose list of examples is the concatenation
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
648 of the list of examples of DataSetFields fields1 and fields2.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
649 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
650 return (self.examples() + other.examples()).fields()
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
651
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
652 def __and__(self,other):
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
653 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
654 fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
655 of the fields of DataSetFields fields1 and fields2.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
656 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
657 return (self.examples() | other.examples()).fields()
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
658
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
659
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
660 class MinibatchDataSet(DataSet):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
661 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
662 Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
663 Each element of the lookup-list should be iterable and sliceable, all of the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
664 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
665 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
666 values_hstack=DataSet().valuesHStack):
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
667 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
668 The user can (and generally should) also provide values_vstack(fieldname,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
669 and values_hstack(fieldnames,fieldvalues) functions behaving with the same
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
670 semantics as the DataSet methods of the same name (but without the self argument).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
671 """
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
672
61
a8b70a9117ad bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 60
diff changeset
673 self._fields=fields_lookuplist
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
674 assert len(fields_lookuplist)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
675 self.length=len(fields_lookuplist[0])
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
676 for field in fields_lookuplist[1:]:
223
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
677 if self.length != len(field) :
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
678 print 'self.length = ',self.length
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
679 print 'len(field) = ', len(field)
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
680 print 'self._fields.keys() = ', self._fields.keys()
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
681 print 'field=',field
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
682 assert self.length==len(field)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
683 self.values_vstack=values_vstack
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
684 self.values_hstack=values_hstack
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
685
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
686 def __len__(self):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
687 return self.length
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
688
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
689 def __getitem__(self,i):
80
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
690 if type(i) in (slice,list):
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
691 return DataSetFields(MinibatchDataSet(
80
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
692 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
693 if type(i) is int:
85
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 83
diff changeset
694 return Example(self._fields.keys(),[field[i] for field in self._fields])
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
695 if self.hasFields(i):
61
a8b70a9117ad bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 60
diff changeset
696 return self._fields[i]
55
66619ce44497 Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 48
diff changeset
697 assert i in self.__dict__ # else it means we are trying to access a non-existing property
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
698 return self.__dict__[i]
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
699
29
46c5c90019c2 Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents: 28
diff changeset
700 def fieldNames(self):
61
a8b70a9117ad bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 60
diff changeset
701 return self._fields.keys()
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
702
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
703 def hasFields(self,*fieldnames):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
704 for fieldname in fieldnames:
61
a8b70a9117ad bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 60
diff changeset
705 if fieldname not in self._fields.keys():
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
706 return False
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
707 return True
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
708
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
709 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
223
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
710 #@TODO bug somewhere here, fieldnames doesn't seem to be well handled
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
711 class Iterator(object):
223
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
712 def __init__(self,ds,fieldnames):
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
713 # tbm: added two next lines to handle fieldnames
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
714 if fieldnames is None: fieldnames = ds._fields.keys()
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
715 self.fieldnames = fieldnames
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
716
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
717 self.ds=ds
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
718 self.next_example=offset
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
719 assert minibatch_size > 0
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
720 if offset+minibatch_size > ds.length:
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
721 raise NotImplementedError()
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
722 def __iter__(self):
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
723 return self
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
724 def next(self):
61
a8b70a9117ad bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 60
diff changeset
725 upper = self.next_example+minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
726 assert upper<=self.ds.length
223
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
727 #minibatch = Example(self.ds._fields.keys(),
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
728 # [field[self.next_example:upper]
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
729 # for field in self.ds._fields])
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
730 # tbm: modif to use fieldnames
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
731 values = []
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
732 for f in self.fieldnames :
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
733 #print 'we have field',f,'in fieldnames'
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
734 values.append( self.ds._fields[f][self.next_example:upper] )
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
735 minibatch = Example(self.fieldnames,values)
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
736 #print minibatch
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
737 self.next_example+=minibatch_size
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
738 return minibatch
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
739
223
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
740 # tbm: added fieldnames to handle subset of fieldnames
517364d48ae0 should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 218
diff changeset
741 return Iterator(self,fieldnames)
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
742
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
743 def valuesVStack(self,fieldname,fieldvalues):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
744 return self.values_vstack(fieldname,fieldvalues)
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
745
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
746 def valuesHStack(self,fieldnames,fieldvalues):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
747 return self.values_hstack(fieldnames,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
748
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class HStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
    i.e. whose list of fields is the concatenation of their lists of fields.

    If a field name is found in more than one of the datasets, then either an error is
    raised or the fields are renamed (either by prefixing the __name__ attribute
    of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).

    @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
    """
    def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
        """
        @param datasets: sequence of datasets to stack side by side; they must all
            have the same length.
        @param accept_nonunique_names: if True, fields whose name appears in more than
            one dataset are renamed; if False such a conflict raises ValueError.
        @param description: forwarded to L{DataSet.__init__}.
        @param field_types: forwarded to L{DataSet.__init__}.
        @raise ValueError: on a field name conflict when accept_nonunique_names is False.
        """
        DataSet.__init__(self,description,field_types)
        self.datasets=datasets
        self.accept_nonunique_names=accept_nonunique_names
        # maps each exposed field name to the index (in self.datasets) of the dataset providing it
        self.fieldname2dataset={}

        def rename_field(fieldname,dataset,i):
            if hasattr(dataset,"__name__"):
                return dataset.__name__ + "." + fieldname
            return fieldname+"."+str(i)

        # make sure all datasets have the same length and unique field names
        self.length=None
        # (fieldname, dataset index) of the FIRST holder of each conflicting name.
        # Its renaming is deferred until after the scan so that later datasets
        # using the same name are still detected as conflicts.
        names_to_change=[]
        for i,dataset in enumerate(datasets):
            length=len(dataset)
            # compare against None explicitly: a legitimate length of 0 must
            # not be mistaken for "no length recorded yet"
            if self.length is None:
                self.length=length
            else:
                assert self.length==length
            for fieldname in dataset.fieldNames():
                if fieldname in self.fieldname2dataset: # name conflict!
                    if accept_nonunique_names:
                        earlier=(fieldname,self.fieldname2dataset[fieldname])
                        if earlier not in names_to_change:
                            names_to_change.append(earlier)
                        fieldname=rename_field(fieldname,dataset,i)
                    else:
                        raise ValueError("Incompatible datasets: non-unique field name = "+fieldname)
                self.fieldname2dataset[fieldname]=i
        for fieldname,i in names_to_change:
            del self.fieldname2dataset[fieldname]
            self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i
        # NOTE(review): renamed fields are only known under their new name here;
        # minibatches_nowrap forwards the exposed names to the sub-datasets, which
        # presumably only know the original names -- confirm before relying on renaming.

    def hasFields(self,*fieldnames):
        """Return True if every name in fieldnames is a field of this stacked view."""
        for fieldname in fieldnames:
            if fieldname not in self.fieldname2dataset:
                return False
        return True

    def fieldNames(self):
        """Return the exposed field names (the keys of fieldname2dataset)."""
        return self.fieldname2dataset.keys()

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over minibatches built by asking each needed sub-dataset
        for its own minibatches (same minibatch_size, n_batches and offset) and
        concatenating their fields into a single L{LookupList}.

        When fieldnames is empty, or equal to self.fieldNames(), every sub-dataset
        is queried for all of its fields. Fields come out grouped by sub-dataset
        (in dataset order), not necessarily in the order requested.
        """

        class HStackedIterator(object):
            def __init__(self,hsds,iterators):
                self.hsds=hsds
                self.iterators=iterators
            def __iter__(self):
                return self
            def next(self):
                # concatenate all the fields of the minibatches
                l=LookupList()
                for sub_iterator in self.iterators:
                    l.append_lookuplist(sub_iterator.next())
                return l

        # an empty/None fieldnames means "all fields", so there is nothing to check
        assert not fieldnames or self.hasFields(*fieldnames)
        # find out which underlying datasets are necessary to service the required fields
        # and construct corresponding minibatch iterators
        if fieldnames and fieldnames!=self.fieldNames():
            # group requested fields by the index of the dataset that owns them;
            # keying on the index avoids requiring datasets to be hashable
            fields_in_dataset={}
            for fieldname in fieldnames:
                fields_in_dataset.setdefault(self.fieldname2dataset[fieldname],[]).append(fieldname)
            iterators=[self.datasets[i].minibatches(fields_in_dataset[i],minibatch_size,n_batches,offset)
                       for i in sorted(fields_in_dataset)]
        else:
            iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset)
                       for dataset in self.datasets]
        return HStackedIterator(self,iterators)


    def valuesVStack(self,fieldname,fieldvalues):
        """Delegate vertical stacking to the sub-dataset that owns fieldname."""
        return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)

    def valuesHStack(self,fieldnames,fieldvalues):
        """
        We will use the sub-dataset associated with the first fieldname in the fieldnames list
        to do the work, hoping that it can cope with the other values (i.e. won't care
        about the incompatible fieldnames). Hence this heuristic will always work if
        all the fieldnames are of the same sub-dataset.
        """
        return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
847
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class VStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        """
        @param datasets: non-empty sequence of datasets with identical field names,
            stacked one after the other in the order given.
        """
        # initialise the base class with the same (description, field_types) call
        # form that the other DataSet subclasses in this file use
        DataSet.__init__(self,None,None)
        assert len(datasets)>0
        self.datasets=datasets
        self.length=0
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        self.index2dataset={}
        # global row number at which each dataset starts
        self.datasets_start_row=[]
        fieldnames = datasets[-1].fieldNames()
        last=len(datasets)-1
        for k,dataset in enumerate(datasets):
            if k<last:
                # All VStacked datasets (except possibly the last) must be bounded (have a length).
                assert not dataset.is_unbounded()
                assert dataset.fieldNames()==fieldnames
            # len() is also taken of the last dataset, as the original code did
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print("A more efficient mechanism for index2dataset should be implemented")

    def __len__(self):
        """Return the total number of examples over all stacked datasets."""
        return self.length

    def fieldNames(self):
        """Return the field names of the first dataset (all are asserted equal at construction)."""
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        """Return whether the first dataset has all the given fields."""
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """Return (dataset_index, row_within_dataset) for global row number"""
        dataset_index = self.index2dataset[row]
        # subtract the dataset's first global row to obtain the local row
        row_within_dataset = row - self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over minibatches of minibatch_size consecutive examples,
        starting at global row offset.

        A minibatch that straddles the boundary between two (or more) stacked
        datasets is assembled from one piece per dataset, the pieces being joined
        field by field with valuesVStack. If n_batches is None, iteration continues
        until fewer than minibatch_size rows remain.
        """

        class VStackedIterator(object):
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset        # global row where the next minibatch starts
                self.batches_left=n_batches # None means "until the rows run out"

            def __iter__(self):
                return self

            def next(self):
                vsds=self.vsds
                if self.batches_left is not None and self.batches_left<=0:
                    raise StopIteration
                # no wrap-around here: stop when a full minibatch no longer fits
                if self.next_row+minibatch_size>len(vsds):
                    raise StopIteration
                # collect one piece per dataset touched by this minibatch
                pieces=[]
                first_dataset=None
                row=self.next_row
                n_left=minibatch_size
                while n_left>0:
                    dataset_index,local_row=vsds.locate_row(row)
                    dataset=vsds.datasets[dataset_index]
                    if first_dataset is None:
                        first_dataset=dataset
                    n_here=min(n_left,len(dataset)-local_row)
                    pieces.append(dataset.minibatches(fieldnames,n_here,1,local_row).next())
                    row+=n_here
                    n_left-=n_here
                mb=pieces[0]
                if len(pieces)>1:
                    # empty/None fieldnames is taken to mean "all fields"
                    names=fieldnames or vsds.fieldNames()
                    mb = Example(names,
                                 [first_dataset.valuesVStack(name,
                                                             [piece[name] for piece in pieces])
                                  for name in names])
                self.next_row+=minibatch_size
                if self.batches_left is not None:
                    self.batches_left-=1
                return mb

        return VStackedIterator(self)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
955
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
class ArrayFieldsDataSet(DataSet):
    """
    Virtual super-class of datasets whose field values are numpy arrays,
    thus defining valuesHStack and valuesVStack for sub-classes.
    """
    def __init__(self, description=None, field_types=None):
        """Forward description and field_types unchanged to DataSet.__init__."""
        DataSet.__init__(self, description, field_types)

    def valuesHStack(self, fieldnames, fieldvalues):
        """Concatenate field values horizontally, e.g. two vectors
        become a longer vector, two matrices become a wider matrix, etc.

        fieldnames is part of the signature but is not used by this
        implementation; the result is numpy.hstack(fieldvalues).
        """
        return numpy.hstack(fieldvalues)

    def valuesVStack(self, fieldname, values):
        """Concatenate field values along their first axis, e.g. two vectors
        become a longer vector, two matrices become a longer matrix, etc.

        fieldname is part of the signature but is not used by this
        implementation.

        values must be an indexable sequence of numpy.ndarray objects, each
        with at least one axis.  The result has the dtype of values[0] and
        its first dimension is the sum of the first dimensions of all values.

        Raises TypeError(v, type(v)) for the first element v that is not a
        numpy.ndarray, and IndexError if values is empty.
        """
        for v in values:
            if not isinstance(v, numpy.ndarray):
                raise TypeError(v, type(v))

        # Read the dtype first so that an empty sequence still fails with
        # IndexError, exactly as before.
        dtype = values[0].dtype
        # numpy.concatenate replaces the former preallocate-and-copy loop.
        # Unlike slice assignment it refuses arrays whose trailing dimensions
        # differ instead of silently broadcasting them.
        rval = numpy.concatenate(values, axis=0)
        if rval.dtype != dtype:
            # Mixed dtypes get promoted by concatenate; the result has always
            # carried the dtype of the first value, so cast back to it.
            rval = rval.astype(dtype)
        return rval
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
984
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
class ArrayDataSet(ArrayFieldsDataSet):
    """
    An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
    whose first axis iterates over examples, second axis determines fields.
    If the underlying array is N-dimensional (has N axes), then the field
    values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
    """

    def __init__(self, data_array, fields_columns, **kwargs):
        """
        Construct an ArrayDataSet from the underlying numpy array (data) and
        a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
        using the standard arguments for indexing/slicing: integer for a column index,
        slice for an interval of columns (with possible stride), or iterable of column indices.

        Extra keyword arguments are forwarded to ArrayFieldsDataSet.__init__.
        The fields_columns mapping is stored by reference and its slice
        entries are completed in place (missing start/step filled in).
        """
        ArrayFieldsDataSet.__init__(self, **kwargs)
        self.data=data_array
        self.fields_columns=fields_columns

        # check consistency and complete slices definitions
        # (iterate over a snapshot since entries are reassigned in the loop)
        for fieldname, fieldcolumns in list(self.fields_columns.items()):
            if type(fieldcolumns) is int:
                assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
                # An integer column is deliberately kept as a plain integer
                # (not wrapped in a one-element list), following the change
                # noted by James on 22/05/2008.
            elif type(fieldcolumns) is slice:
                # Fill in only the components that are missing.  The previous
                # code rebuilt the slice from locals that stayed None whenever
                # the caller had supplied a value, so slice(2,5) was stored as
                # slice(None,5,1) and silently selected columns 0..4.
                start=fieldcolumns.start
                step=fieldcolumns.step or 1
                if start is None and step>0:
                    # For a negative step a missing start means "from the
                    # last column", so it is only defaulted to 0 when the
                    # slice runs forward.
                    start=0
                self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step)
            elif hasattr(fieldcolumns,"__iter__"): # something like a list
                for i in fieldcolumns:
                    assert i>=0 and i<data_array.shape[1]

    def fieldNames(self):
        """Return the keys of the fields_columns mapping."""
        return self.fields_columns.keys()

    def __len__(self):
        """Return len() of the underlying data array."""
        return len(self.data)

    def __getitem__(self,key):
        """More efficient implementation than the default __getitem__

        Dispatches on the exact type of key:
          - int: an Example holding, for every field, numpy.asarray of the
            data at row key and that field's columns;
          - slice: a MinibatchDataSet over the selected rows;
          - list: a MinibatchDataSet over the listed rows (elements for which
            hasFields() is true are first replaced by that field's columns);
          - anything else: if hasFields(key), all rows of that field's
            columns; otherwise the instance attribute named key.
        """
        fieldnames=self.fields_columns.keys()
        values=self.fields_columns.values()
        if type(key) is int:
            return Example(fieldnames,
                           [numpy.asarray(self.data[key,col]) for col in values])
        if type(key) is slice:
            return MinibatchDataSet(Example(fieldnames,
                                            [self.data[key,col] for col in values]))
        if type(key) is list:
            # Build a new list rather than overwriting elements of the
            # caller's list, which used to be modified as a side effect.
            key=[self.fields_columns[k] if self.hasFields(k) else k
                 for k in key]
            return MinibatchDataSet(Example(fieldnames,
                                            #we must separate differently for list as numpy
                                            # doesn't support self.data[[i1,...],[i2,...]]
                                            # when there are more than two i1 and i2
                                            [self.data[key,:][:,col]
                                             if isinstance(col,list) else
                                             self.data[key,col] for col in values]),
                                    self.valuesVStack,self.valuesHStack)

        # else check for a fieldname
        if self.hasFields(key):
            return self.data[:,self.fields_columns[key]]
        # else we are trying to access a property of the dataset
        assert key in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[key]

    def __iter__(self):
        """Return an iterator over single examples, starting at row 0."""
        class ArrayDataSetIterator2(object):
            """Iterator yielding one row at a time as a LookupList.

            The same LookupList object is returned by every call to next();
            only its _values are replaced.
            """
            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
                # n_batches is accepted but not used by this iterator.
                if fieldnames is None: fieldnames = dataset.fieldNames()
                # store the resulting minibatch in a lookup-list of values
                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
                self.dataset=dataset
                self.minibatch_size=minibatch_size
                assert offset>=0 and offset<len(dataset.data)
                assert offset+minibatch_size<=len(dataset.data)
                self.current=offset
                # look up the column spec of each field once, up front
                self.columns = [self.dataset.fields_columns[f]
                                for f in self.minibatch._names]
            def __iter__(self):
                return self
            def next(self):
                #@todo: we suppose that we need to stop only when minibatch_size == 1.
                # Otherwise, MinibatchWrapAroundIterator do it.
                if self.current>=self.dataset.data.shape[0]:
                    raise StopIteration
                sub_data = self.dataset.data[self.current]
                self.minibatch._values = [sub_data[c] for c in self.columns]

                self.current+=self.minibatch_size
                return self.minibatch

        return ArrayDataSetIterator2(self,self.fieldNames(),1,0,0)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """Return an iterator over consecutive row-slices of minibatch_size
        rows, starting at row offset, restricted to the given fieldnames
        (all fields when fieldnames is None)."""
        class ArrayDataSetIterator(object):
            """Iterator yielding minibatches as a LookupList of column views.

            The same LookupList object is returned by every call to next();
            only its _values are replaced.  next() never raises StopIteration
            itself.
            """
            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
                # n_batches is accepted but not used by this iterator.
                if fieldnames is None: fieldnames = dataset.fieldNames()
                # store the resulting minibatch in a lookup-list of values
                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
                self.dataset=dataset
                self.minibatch_size=minibatch_size
                assert offset>=0 and offset<len(dataset.data)
                assert offset+minibatch_size<=len(dataset.data)
                self.current=offset
            def __iter__(self):
                return self
            def next(self):
                #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
                sub_data = self.dataset.data[self.current:self.current+self.minibatch_size]
                self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
                self.current+=self.minibatch_size
                return self.minibatch

        return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1115
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1116
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1117 class CachedDataSet(DataSet):
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1118 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1119 Wrap a L{DataSet} whose values are computationally expensive to obtain
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1120 (e.g. because they involve some computation, or disk access),
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1121 so that repeated accesses to the same example are done cheaply,
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1122 by caching every example value that has been accessed at least once.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1123
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1124 Optionally, for finite-length dataset, all the values can be computed
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1125 (and cached) upon construction of the CachedDataSet, rather at the
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1126 first access.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1127
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1128 @todo: when cache_all_upon_construction create mini-batches that are as
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1129 large as possible but not so large as to fill up memory.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1130
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1131 @todo: add disk-buffering capability, so that when the cache becomes too
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1132 big for memory, we cache things on disk, trying to keep in memory only
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1133 the record most likely to be accessed next.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1134 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1135 def __init__(self,source_dataset,cache_all_upon_construction=False):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1136 self.source_dataset=source_dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1137 self.cache_all_upon_construction=cache_all_upon_construction
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1138 self.cached_examples = [] #a list of LookupList (copies)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1139 if cache_all_upon_construction:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1140 # this potentially brings all the source examples
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1141 # into memory at once, which may be too much
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1142 # the work could possibly be done by minibatches
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1143 # that are as large as possible but no more than what memory allows.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1144 fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1145 assert all([len(self)==len(fval) for fval in fields_values])
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1146 for example in fields_values.examples():
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1147 dup = copy.copy(example)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1148 self.cached_examples.append(dup)
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1149
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1150 self.fieldNames = source_dataset.fieldNames
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1151 self.hasFields = source_dataset.hasFields
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1152 self.valuesHStack = source_dataset.valuesHStack
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1153 self.valuesVStack = source_dataset.valuesVStack
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1154
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1155 def __len__(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1156 return len(self.source_dataset)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1157
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1158 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1159 class CacheIterator(object):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1160 def __init__(self,dataset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1161 self.dataset=dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1162 self.current=offset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1163 self.all_fields = self.dataset.fieldNames()==fieldnames
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1164 def __iter__(self): return self
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1165 def next(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1166 upper = self.current+minibatch_size
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1167 cache_len = len(self.dataset.cached_examples)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1168 if upper>cache_len:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1169 # whole minibatch is not already in cache
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1170 # cache everything from current length to upper
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1171 for example in self.dataset.source_dataset[cache_len:upper]:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1172 self.dataset.cached_examples.append(example)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1173
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1174 next_range = slice(self.current, self.current+minibatch_size)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1175 blah = self.dataset.cached_examples[next_range]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1176 all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1177 self.current+=minibatch_size
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1178
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1179 #little optimization to avoid second Example computation if
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1180 #possible.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1181 if self.all_fields:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1182 return all_fields_minibatch
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1183
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1184 rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1185 return rval
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1186 return CacheIterator(self)
252
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1187
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1188 def __getitem__(self,i):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1189 if type(i)==int and len(self.cached_examples)>i:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1190 return self.cached_examples[i]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1191 else:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1192 return self.source_dataset[i]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1193
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1194 def __iter__(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1195 class CacheIteratorIter(object):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1196 def __init__(self,dataset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1197 self.dataset=dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1198 self.l = len(dataset)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1199 self.current = 0
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1200 self.fieldnames = self.dataset.fieldNames()
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1201 self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1202 def __iter__(self): return self
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1203 def next(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1204 if self.current>=self.l:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1205 raise StopIteration
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1206 cache_len = len(self.dataset.cached_examples)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1207 if self.current>=cache_len: # whole minibatch is not already in cache
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1208 # cache the current example, fetched from the source dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1209 self.dataset.cached_examples.append(
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1210 self.dataset.source_dataset[self.current])
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1211 self.example._values = self.dataset.cached_examples[self.current]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1212 self.current+=1
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1213 return self.example
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1214
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1215 return CacheIteratorIter(self)
252
856d14dc4468 implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 251
diff changeset
1216
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1217 class ApplyFunctionDataSet(DataSet):
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1218 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1219 A L{DataSet} that contains as fields the results of applying a
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1220 given function example-wise or minibatch-wise to all the fields of
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1221 an input dataset. The output of the function should be an iterable
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1222 (e.g. a list or a LookupList) over the resulting values.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1223
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1224 The function takes as input the fields of the dataset, not the examples.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1225
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1226 In minibatch mode, the function is expected to work on minibatches
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1227 (takes a minibatch in input and returns a minibatch in output). More
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1228 precisely, it means that each element of the input or output list
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1229 should be iterable and indexable over the individual example values
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1230 (typically these elements will be numpy arrays). All of the elements
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1231 in the input and output lists should have the same length, which is
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1232 the length of the minibatch.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1233
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1234 The function is applied each time an example or a minibatch is accessed.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1235 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1236
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1237 If the values_{h,v}stack functions are not provided, then
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1238 the input_dataset.values{H,V}Stack functions are used by default.
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1239 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1240 def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1241 values_hstack=None,values_vstack=None,
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1242 description=None,fieldtypes=None):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1243 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1244 Constructor takes an input dataset that has as many fields as the function
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1245 expects as inputs. The resulting dataset has as many fields as the function
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1246 produces as outputs, and that should correspond to the number of output names
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1247 (provided in a list).
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1248
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1249 Note that the expected semantics of the function differs in minibatch mode
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1250 (it takes minibatches of inputs and produces minibatches of outputs, as
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1251 documented in the class comment).
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
1252
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1253 TBM: are fieldtypes the old field types (from input_dataset) or the new ones
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1254 (for the new dataset created)?
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1255 """
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1256 self.input_dataset=input_dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1257 self.function=function
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1258 self.output_names=output_names
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1259 self.minibatch_mode=minibatch_mode
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1260 DataSet.__init__(self,description,fieldtypes)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1261 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1262 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1263
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1264 def __len__(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1265 return len(self.input_dataset)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1266
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1267 def fieldNames(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1268 return self.output_names
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1269
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1270 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1271 class ApplyFunctionIterator(object):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1272 def __init__(self,output_dataset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1273 self.input_dataset=output_dataset.input_dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1274 self.output_dataset=output_dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1275 self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1276 n_batches=n_batches,offset=offset).__iter__()
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1277
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1278 def __iter__(self): return self
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1279
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1280 def next(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1281 function_inputs = self.input_iterator.next()
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1282 all_output_names = self.output_dataset.output_names
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1283 if self.output_dataset.minibatch_mode:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1284 function_outputs = self.output_dataset.function(*function_inputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1285 else:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1286 input_examples = zip(*function_inputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1287 output_examples = [self.output_dataset.function(*input_example)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1288 for input_example in input_examples]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1289 function_outputs = [self.output_dataset.valuesVStack(name,values)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1290 for name,values in zip(all_output_names,
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1291 zip(*output_examples))]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1292 all_outputs = Example(all_output_names,function_outputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1293 if fieldnames==all_output_names:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1294 return all_outputs
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1295 return Example(fieldnames,[all_outputs[name] for name in fieldnames])
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1296
211
bd728c83faff in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents: 203
diff changeset
1297
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1298 return ApplyFunctionIterator(self)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1299
266
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1300 def __iter__(self): # only implemented for increased efficiency
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1301 class ApplyFunctionSingleExampleIterator(object):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1302 def __init__(self,output_dataset):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1303 self.current=0
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1304 self.output_dataset=output_dataset
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1305 self.input_iterator=output_dataset.input_dataset.__iter__()
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1306 def __iter__(self): return self
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1307 def next(self):
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1308 if self.output_dataset.minibatch_mode:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1309 function_inputs = [[input] for input in self.input_iterator.next()]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1310 outputs = self.output_dataset.function(*function_inputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1311 assert all([hasattr(output,'__iter__') for output in outputs])
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1312 function_outputs = [output[0] for output in outputs]
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1313 else:
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1314 function_inputs = self.input_iterator.next()
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1315 function_outputs = self.output_dataset.function(*function_inputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1316 return Example(self.output_dataset.output_names,function_outputs)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1317 return ApplyFunctionSingleExampleIterator(self)
6e69fb91f3c0 initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 258
diff changeset
1318
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1319
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1320 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1321 """
167
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1322 Wraps an arbitrary L{DataSet} into one for supervised learning tasks
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1323 by forcing the user to define a set of fields as the 'input' field
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1324 and a set of fields as the 'target' field. Optionally, a single
4803cb76e26b Updated documentation
Joseph Turian <turian@gmail.com>
parents: 166
diff changeset
1325 weight_field can also be defined.
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1326 """
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1327 args = ((input_fields,'input'),(output_fields,'target'))
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1328 if weight_field: args+=(([weight_field],'weight'))
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
1329 return src_dataset.merge_fields(*args)
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1330
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1331
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1332
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1333