annotate dataset.py @ 151:39bb21348fdf

Automated merge with ssh://p-omega1@lgcm.iro.umontreal.ca/tlearn
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Mon, 12 May 2008 15:51:43 -0400
parents 8173e196e291 9abd19af822e
children 3f627e844cba 71107b0ac860
rev   line source
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
1
12
ff4e551490f1 Added LookupList type in lookup_list.py and used it to keep order
bengioy@esprit.iro.umontreal.ca
parents: 11
diff changeset
2 from lookup_list import LookupList
ff4e551490f1 Added LookupList type in lookup_list.py and used it to keep order
bengioy@esprit.iro.umontreal.ca
parents: 11
diff changeset
3 Example = LookupList
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
4 from misc import unique_elements_list_intersection
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
5 from string import join
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
6 from sys import maxint
45
a5c70dc42972 Test functions for dataset.py
bengioy@grenat.iro.umontreal.ca
parents: 44
diff changeset
7 import numpy
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
8
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
class AbstractFunction(Exception):
    """Derived class must override this function"""
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class NotImplementedYet(NotImplementedError):
    """Work in progress, this should eventually be implemented"""
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
11
110
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
class AttributesHolder(object):
    """Base class for objects that expose a set of named attributes.

    Sub-classes override attributeNames(); setAttributes() assigns values
    to attributes by name.
    """

    def __init__(self):
        pass

    def attributeNames(self):
        """Return the names of the attributes held by this object.

        Abstract in this class: always raises AbstractFunction.
        """
        raise AbstractFunction()

    def setAttributes(self, attribute_names, attribute_values, make_copies=False):
        """Set each attribute named in attribute_names to the matching value.

        attribute_values may be a single value rather than a list or tuple
        when attribute_names has length 1.

        Names and values are paired with zip(), so any surplus names or
        values are silently ignored.

        If make_copies is true, a deep copy of each value is stored instead
        of the value itself.
        """
        if len(attribute_names) == 1 and not isinstance(attribute_values, (list, tuple)):
            attribute_values = [attribute_values]
        if make_copies:
            # Imported here because the module's import block does not bring
            # `copy` into scope; without it this branch raised NameError.
            import copy
        for name, value in zip(attribute_names, attribute_values):
            if make_copies:
                value = copy.deepcopy(value)
            setattr(self, name, value)
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
30
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
31
8fa1ef2411a0 Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents: 105
diff changeset
32 class DataSet(AttributesHolder):
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
33 """A virtual base class for datasets.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
34
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
35 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
36 with learning algorithms (for training and testing them): rows/records are called examples, and
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
37 columns/attributes are called fields. The field value for a particular example can be an arbitrary
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
38 python object, which depends on the particular dataset.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
39
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
40 We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
41 should return sys.maxint).
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
42
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
43 A DataSet is a generator of iterators; these iterators can run through the
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
44 examples or the fields in a variety of ways. A DataSet need not necessarily have a finite
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
45 or known length, so this class can be used to interface to a 'stream' which
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
46 feeds on-line learning (however, as noted below, some operations are not
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
47 feasible or not recommended on streams).
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
48
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
49 To iterate over examples, there are several possibilities:
90
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
50 - for example in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
51 - for val1,val2,... in dataset:
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
52 - for example in dataset(field1, field2,field3, ...):
a289b8bed64c corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 88
diff changeset
53 - for val1,val2,val3 in dataset(field1, field2,field3):
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
54 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
55 - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
56 Each of these is documented below. All of these iterators are expected
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
57 to provide, in addition to the usual 'next()' method, a 'next_index()' method
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
58 which returns a non-negative integer pointing to the position of the next
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
59 example that will be returned by 'next()' (or of the first example in the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
60 next minibatch returned). This is important because these iterators
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
61 can wrap around the dataset in order to do multiple passes through it,
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
62 in possibly irregular ways if the minibatch size is not a divisor of the
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
63 dataset length.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
64
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
65 To iterate over fields, one can do
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
66 - for field in dataset.fields():
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
67 for field_value in field: # iterate over the values associated to that field for all the dataset examples
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
68 - for field in dataset(field1,field2,...).fields() to select a subset of fields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
69 - for field in dataset.fields(field1,field2,...) to select a subset of fields
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
70 and each of these fields is iterable over the examples:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
71 - for field_examples in dataset.fields():
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
72 for example_value in field_examples:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
73 ...
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
74 but when the dataset is a stream (unbounded length), it is not recommended to do
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
75 such things because the underlying dataset may refuse to access the different fields in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
76 an unsynchronized way. Hence the fields() method is illegal for streams, by default.
132
f6505ec32dc3 Updated documentation slightly
Joseph Turian <turian@gmail.com>
parents: 128
diff changeset
77 The result of fields() is a L{DataSetFields} object, which iterates over fields,
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
78 and whose elements are iterable over examples. A DataSetFields object can
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
79 be turned back into a DataSet with its examples() method::
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
80 dataset2 = dataset1.fields().examples()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
81 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
82
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
83 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
84
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
85 Note: The content of a field can be of any type. Field values can also be 'missing'
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
86 (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array)
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
87 fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value.
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
88 What about non-numeric values? None.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
89
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
90 Dataset elements can be indexed and sub-datasets (with a subset
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
91 of examples) can be extracted. These operations are not supported
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
92 by default in the case of streams.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
93
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
94 - dataset[:n] returns a dataset with the n first examples.
16
813723310d75 commenting
bergstrj@iro.umontreal.ca
parents: 15 11
diff changeset
95
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
96 - dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
97
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
98 - dataset[i] returns an Example.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
99
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
100 - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
101
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
102 - dataset[fieldname] an iterable over the values of the field fieldname across
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
103 the dataset (the iterable is obtained by default by calling valuesVStack
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
104 over the values for individual examples).
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
105
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
106 - dataset.<property> returns the value of a property associated with
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
107 the name <property>. The following properties should be supported:
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
108 - 'description': a textual description or name for the dataset
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
109 - 'fieldtypes': a list of types (one per field)
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
110 A DataSet may have other attributes that it makes visible to other objects. These are
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
111 used to store information that is not example-wise but global to the dataset.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
112 The list of names of these attributes is given by the attributeNames() method.
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
113
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
114 Datasets can be concatenated either vertically (increasing the length) or
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
115 horizontally (augmenting the set of fields), if they are compatible, using
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
116 the following operations (with the same basic semantics as numpy.hstack
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
117 and numpy.vstack):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
118
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
119 - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
120
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
121 creates a new dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
122 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
123
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
124 - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
125
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
126 creates a new dataset that concatenates the examples from the argument datasets
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
127 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
128 works if they all have the same fields.
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
129
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
130 According to the same logic, and viewing a DataSetFields object associated to
46
c5b07e87b0cb comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 45
diff changeset
131 a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
132 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
133 examples.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
134
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
135 A dataset can hold arbitrary key-value pairs that may be used to access meta-data
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
136 or other properties of the dataset or associated with the dataset or the result
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
137 of a computation stored in a dataset. These can be accessed through the [key] syntax
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
138 when key is a string (or more specifically, neither an integer, a slice, nor a list).
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
139
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
140 A DataSet sub-class should always redefine the following methods:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
141 - __len__ if it is not a stream
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
142 - fieldNames
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
143 - minibatches_nowrap (called by DataSet.minibatches())
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
144 - valuesHStack
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
145 - valuesVStack
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
146 For efficiency of implementation, a sub-class might also want to redefine
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
147 - hasFields
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
148 - __getitem__ may not be feasible with some streams
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 66
diff changeset
149 - __iter__
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
150 A sub-class should also append attributes to self._attribute_names
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
151 (the default value returned by attributeNames()).
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
152 By convention, attributes not in attributeNames() should have a name
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
153 starting with an underscore.
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
154 @todo enforce/test that convention!
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
155 """
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
156
83
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 82
diff changeset
157 numpy_vstack = lambda fieldname,values: numpy.vstack(values)
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 82
diff changeset
158 numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
77
1e2bb5bad636 toying with different ways to implement learners
bengioy@bengiomac.local
parents: 74
diff changeset
159
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
160 def __init__(self,description=None,fieldtypes=None):
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
161 if description is None:
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
162 # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
163 description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
164 self.description=description
60
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 57
diff changeset
165 self.fieldtypes=fieldtypes
78
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
166 self._attribute_names = ["description"]
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
167 if fieldtypes:
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
168 self._attribute_names.append("fieldtypes")
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
169
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
def attributeNames(self):
    """Return the list of attribute names held in self._attribute_names."""
    names = self._attribute_names
    return names
3499918faa9d In the middle of designing TLearner
bengioy@bengiomac.local
parents: 77
diff changeset
171
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
172 class MinibatchToSingleExampleIterator(object):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
173 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
174 Converts the result of minibatch iterator with minibatch_size==1 into
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
175 single-example values in the result. Therefore the result of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
176 iterating on the dataset itself gives a sequence of single examples
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
177 (whereas the result of iterating over minibatches gives in each
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
178 Example field an iterable object over the individual examples in
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
179 the minibatch).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
180 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
181 def __init__(self, minibatch_iterator):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
182 self.minibatch_iterator = minibatch_iterator
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
183 self.minibatch = None
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
184 def __iter__(self): #makes for loop work
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
185 return self
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
186 def next(self):
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
187 size1_minibatch = self.minibatch_iterator.next()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
188 if not self.minibatch:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
189 self.minibatch = Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
190 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
191 self.minibatch._values = [value[0] for value in size1_minibatch.values()]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
192 return self.minibatch
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
193
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
194 def next_index(self):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
195 return self.minibatch_iterator.next_index()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
196
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
def __iter__(self):
    """Supports the syntax "for i in dataset: ..."

    Using this syntax, "i" will be an Example instance (or equivalent) with
    all the fields of DataSet self.  Every field of "i" will give access to
    a field of a single example.  Fields should be accessible via
    i["fieldname"] or i[3] (in the order defined by the elements of the
    Example returned by this iterator), but the derived class is free
    to accept any type of identifier, and add extra functionality to the iterator.

    The default implementation asks minibatches() for minibatches of size 1
    over all the fields and wraps the result in a
    MinibatchToSingleExampleIterator, which extracts the first example of each field.
    """
    size1_minibatches = self.minibatches(None, minibatch_size = 1)
    return DataSet.MinibatchToSingleExampleIterator(size1_minibatches)
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
210
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
211
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
212 class MinibatchWrapAroundIterator(object):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
213 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
214 An iterator for minibatches that handles the case where we need to wrap around the
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
215 dataset because n_batches*minibatch_size > len(dataset). It is constructed from
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
216 a dataset that provides a minibatch iterator that does not need to handle that problem.
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
217 This class is a utility for dataset subclass writers, so that they do not have to handle
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
218 this issue multiple times, nor check that fieldnames are valid, nor handle the
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
219 empty fieldnames (meaning 'use all the fields').
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
220 """
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
221 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
222 self.dataset=dataset
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
223 self.fieldnames=fieldnames
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
224 self.minibatch_size=minibatch_size
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
225 self.n_batches=n_batches
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
226 self.n_batches_done=0
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
227 self.next_row=offset
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
228 self.offset=offset
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
229 self.L=len(dataset)
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
230 assert offset+minibatch_size<=self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
231 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
232 if n_batches is not None:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
233 ds_nbatches = min(n_batches,ds_nbatches)
38
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
234 if fieldnames:
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
235 assert dataset.hasFields(*fieldnames)
d637ad8f7352 Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents: 37
diff changeset
236 else:
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
237 self.fieldnames=dataset.fieldNames()
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
238 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
239 ds_nbatches,self.next_row)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
240
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
241 def __iter__(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
242 return self
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
243
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
244 def next_index(self):
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
245 return self.next_row
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
246
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
247 def next(self):
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
248 if self.n_batches and self.n_batches_done==self.n_batches:
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
249 raise StopIteration
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
250 elif not self.n_batches and self.next_row ==self.L:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
251 raise StopIteration
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
252 upper = self.next_row+self.minibatch_size
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
253 if upper <=self.L:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
254 minibatch = self.iterator.next()
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
255 else:
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
256 if not self.n_batches:
101
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
257 upper=min(upper, self.L)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
258 # if their is not a fixed number of batch, we continue to the end of the dataset.
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
259 # this can create a minibatch that is smaller then the minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
260 assert (self.L-self.next_row)<=self.minibatch_size
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
261 minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
262 else:
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
263 # we must concatenate (vstack) the bottom and top parts of our minibatch
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
264 # first get the beginning of our minibatch (top of dataset)
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
265 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
266 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
267 minibatch = Example(self.fieldnames,
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
268 [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
a1740a99b81f by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 99
diff changeset
269 for name in self.fieldnames])
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
270 self.next_row=upper
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
271 self.n_batches_done+=1
43
e92244f30116 Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents: 42
diff changeset
272 if upper >= self.L and self.n_batches:
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
273 self.next_row -= self.L
98
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
274 ds_nbatches = (self.L-self.next_row)/self.minibatch_size
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
275 if self.n_batches is not None:
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
276 ds_nbatches = min(self.n_batches,ds_nbatches)
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
277 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
7186e4f502d1 bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 95
diff changeset
278 ds_nbatches,self.next_row)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
279 return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack,
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
280 self.dataset.valuesHStack),
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
281 minibatch.keys())
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
282
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
283
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
minibatches_fieldnames = None
minibatches_minibatch_size = 1
minibatches_n_batches = None
def minibatches(self,
                fieldnames = minibatches_fieldnames,
                minibatch_size = minibatches_minibatch_size,
                n_batches = minibatches_n_batches,
                offset = 0):
    """
    Return an iterator that supports three forms of syntax:

        for i in dataset.minibatches(None,**kwargs): ...

        for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

        for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

    Using the first two syntaxes, "i" will be an indexable object, such as a list,
    tuple, or Example instance. In both cases, i[k] is a list-like container
    of a batch of current examples. In the second case, i[0] is a
    list-like container of the f1 field of a batch of current examples, i[1] is
    a list-like container of the f2 field, etc.

    Using the first syntax, all the fields will be returned in "i".
    Using the third syntax, i1, i2, i3 will be list-like containers of the
    f1, f2, and f3 fields of a batch of examples on each loop iteration.

    The minibatches iterator is expected to return upon each call to next()
    a DataSetFields object, which is a LookupList (indexed by the field names) whose
    elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
    a sub-dataset that can be used to iterate over the individual examples
    in the minibatch. Hence a minibatch can be converted back to a regular
    dataset or its fields can be looked at individually (and possibly iterated over).

    PARAMETERS
    - fieldnames (list of any type, default None):
        The loop variables i1, i2, i3 (in the example above) should contain the
        f1, f2, and f3 fields of the current batch of examples.  If None, the
        derived class can choose a default, e.g. all fields (the default
        implementation uses all the fields of the dataset).

    - minibatch_size (integer, default 1)
        On every iteration, the variables i1, i2, i3 will have
        minibatch_size elements, e.g. len(i1) == minibatch_size.  With the
        default implementation and n_batches=None, the last minibatch stops at
        the end of the dataset and may therefore be smaller.

    - n_batches (integer, default None)
        The iterator will loop exactly this many times, and then stop.  If None,
        the derived class can choose a default (the default implementation
        iterates once up to the end of the dataset).  If (-1), then the returned
        iterator should support looping indefinitely.

    - offset (integer, default 0)
        The iterator will start at example 'offset' in the dataset, rather than the default.

    Note: A list-like container is something like a tuple, list, numpy.ndarray or
    any other object that supports integer indexing and slicing.

    """
    wrap_around_iterator = DataSet.MinibatchWrapAroundIterator
    return wrap_around_iterator(self,fieldnames,minibatch_size,n_batches,offset)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
341
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    The minibatches iterator generator that sub-classes must define; this
    base version only raises AbstractFunction.

    An implementation does not have to deal with wrapping around the dataset
    (possibly several times): MinibatchWrapAroundIterator takes care of it when
    DataSet.minibatches() is called.  The next() method of the returned iterator
    does not have to check the termination condition either, since
    DataSet.minibatches raises StopIteration before an improper call to
    minibatches_nowrap's next() is made.  That next() method can assert that its
    next row will always be within [0,len(dataset)).  Finally, the returned
    iterator does not need a next_index() method, as it is provided by
    MinibatchWrapAroundIterator.
    """
    raise AbstractFunction()
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
355
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
356 def __len__(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
357 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
358 len(dataset) returns the number of examples in the dataset.
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
359 By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint).
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
360 Sub-classes which implement finite-length datasets should redefine this method.
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
361 Some methods only make sense for finite-length datasets.
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
362 """
123
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 110
diff changeset
363 return maxint
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
364
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
365 def is_unbounded(self):
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
366 """
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
367 Tests whether a dataset is unbounded (e.g. a stream).
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
368 """
123
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 110
diff changeset
369 return len(self)==maxint
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
370
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
371 def hasFields(self,*fieldnames):
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
372 """
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
373 Return true if the given field name (or field names, if multiple arguments are
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
374 given) is recognized by the DataSet (i.e. can be used as a field name in one
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
375 of the iterators).
29
46c5c90019c2 Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents: 28
diff changeset
376
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
377 The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
378 method. Many datasets may store their field names in a dictionary, which would allow more efficiency.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
379 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
380 return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
381
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
382 def fieldNames(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
383 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
384 Return the list of field names that are supported by the iterators,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
385 and for which hasFields(fieldname) would return True.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
386 """
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
387 raise AbstractFunction()
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
388
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
389 def __call__(self,*fieldnames):
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
390 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
391 Return a dataset that sees only the fields whose names are specified.
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
392 """
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
393 assert self.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
394 return self.fields(*fieldnames).examples()
20
266c68cb6136 Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents: 19
diff changeset
395
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
396 def fields(self,*fieldnames):
29
46c5c90019c2 Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents: 28
diff changeset
397 """
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
398 Return a DataSetFields object associated with this dataset.
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
399 """
74
b4159cbdc06b Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 73
diff changeset
400 return DataSetFields(self,fieldnames)
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
401
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
402 def __getitem__(self,i):
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
403 """
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
404 dataset[i] returns the (i+1)-th example of the dataset.
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
405 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
406 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (stopping before j).
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
407 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
408 dataset['key'] returns a property associated with the given 'key' string.
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
409 If 'key' is a fieldname, then the VStacked field values (iterable over
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
410 field values) for that field are returned. Other keys may be supported
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
411 by different dataset subclasses. The following key names are encouraged:
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
412 - 'description': a textual description or name for the dataset
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
413 - '<fieldname>.type': a type name or value for a given <fieldname>
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
414
39
c682c6e9bf93 Minor edits
bengioy@esprit.iro.umontreal.ca
parents: 38
diff changeset
415 Note that some stream datasets may be unable to implement random access, i.e.
c682c6e9bf93 Minor edits
bengioy@esprit.iro.umontreal.ca
parents: 38
diff changeset
416 arbitrary slicing/indexing
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
417 because they can only iterate through examples one at a time or one minibatch at a time
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
418 and do not actually store or keep past (or future) examples.
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
419
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
420 The default implementation of getitem uses the minibatches iterator
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
421 to obtain one example, one slice, or a list of examples. It may not
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
422 always be the most efficient way to obtain the result, especially if
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
423 the data are actually stored in a memory array.
28
541a273bc89f Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents: 26
diff changeset
424 """
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
425 # check for an index
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
426 if type(i) is int:
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
427 return DataSet.MinibatchToSingleExampleIterator(
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
428 self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
429 rows=None
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
430 # or a slice
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
431 if type(i) is slice:
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
432 if not i.start: i=slice(0,i.stop,i.step)
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
433 if not i.step: i=slice(i.start,i.stop,1)
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
434 if i.step is 1:
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
435 return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
436 rows = range(i.start,i.stop,i.step)
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
437 # or a list of indices
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
438 elif type(i) is list:
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
439 rows = i
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
440 if rows is not None:
48
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
441 examples = [self[row] for row in rows]
b6730f9a336d Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents: 46
diff changeset
442 fields_values = zip(*examples)
45
a5c70dc42972 Test functions for dataset.py
bengioy@grenat.iro.umontreal.ca
parents: 44
diff changeset
443 return MinibatchDataSet(
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
444 Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
445 for fieldname,field_values
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
446 in zip(self.fieldNames(),fields_values)]),
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
447 self.valuesVStack,self.valuesHStack)
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
448 # else check for a fieldname
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
449 if self.hasFields(i):
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
450 return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
451 # else we are trying to access a property of the dataset
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
452 assert i in self.__dict__ # else it means we are trying to access a non-existing property
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
453 return self.__dict__[i]
22
b6b36f65664f Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents: 20
diff changeset
454
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
455 def valuesHStack(self,fieldnames,fieldvalues):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
456 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
457 Return a value that corresponds to concatenating (horizontally) several field values.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
458 This can be useful to merge some fields. The implementation of this operation is likely
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
459 to involve a copy of the original values. When the values are numpy arrays, the
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
460 result should be numpy.hstack(values). If it makes sense, this operation should
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
461 work as well when each value corresponds to multiple examples in a minibatch
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
462 e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
463 then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
464 The default is to use numpy.hstack for numpy.ndarray values, and a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
465 pointing to the original values for other data types.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
466 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
467 all_numpy=True
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
468 for value in fieldvalues:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
469 if not type(value) is numpy.ndarray:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
470 all_numpy=False
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
471 if all_numpy:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
472 return numpy.hstack(fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
473 # the default implementation of horizontal stacking is to put values in a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
474 return fieldvalues
26
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
475
672fe4b23032 Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents: 23
diff changeset
476
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
477 def valuesVStack(self,fieldname,values):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
478 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
479 Return a value that corresponds to concatenating (vertically) several values of the
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
480 same field. This can be important to build a minibatch out of individual examples. This
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
481 is likely to involve a copy of the original values. When the values are numpy arrays, the
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
482 result should be numpy.vstack(values).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
483 The default is to use numpy.vstack for numpy.ndarray values, and a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
484 pointing to the original values for other data types.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
485 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
486 all_numpy=True
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
487 for value in values:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
488 if not type(value) is numpy.ndarray:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
489 all_numpy=False
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
490 if all_numpy:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
491 return numpy.vstack(values)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
492 # the default implementation of vertical stacking is to put values in a list
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
493 return values
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
494
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
495 def __or__(self,other):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
496 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
497 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
498 fields of the argument datasets. This only works if they all have the same length.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
499 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
500 return HStackedDataSet([self,other])
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
501
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
502 def __and__(self,other):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
503 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
504 dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
505 (and whose length is the sum of the length of the argument datasets). This only
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
506 works if they all have the same fields.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
507 """
135
0d8e721cc63c Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 134
diff changeset
508 return VStackedDataSet([self,other])
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
509
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
510 def hstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
511 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
512 hstack(dataset1,dataset2,...) returns dataset1 | dataset2 | ...
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
513 which is a dataset whose fields list is the concatenation of the fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
514 of the individual datasets.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
515 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
516 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
517 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
518 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
519 return HStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
520
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
521 def vstack(datasets):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
522 """
40
88fd1cce08b9 replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents: 39
diff changeset
523 vstack(dataset1,dataset2,...) returns dataset1 & dataset2 & ...
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
524 which is a dataset which iterates first over the examples of dataset1, then
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
525 over those of dataset2, etc.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
526 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
527 assert len(datasets)>0
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
528 if len(datasets)==1:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
529 return datasets[0]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
530 return VStackedDataSet(datasets)
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
531
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
532 class FieldsSubsetDataSet(DataSet):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
533 """
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
534 A sub-class of DataSet that selects a subset of the fields.
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
535 """
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
536 def __init__(self,src,fieldnames):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
537 self.src=src
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
538 self.fieldnames=fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
539 assert src.hasFields(*fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
540 self.valuesHStack = src.valuesHStack
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
541 self.valuesVStack = src.valuesVStack
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
542
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
543 def __len__(self): return len(self.src)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
544
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
545 def fieldNames(self):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
546 return self.fieldnames
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
547
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
548 def __iter__(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
549 class FieldsSubsetIterator(object):
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
550 def __init__(self,ds):
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
551 self.ds=ds
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
552 self.src_iter=ds.src.__iter__()
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
553 self.example=None
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
554 def __iter__(self): return self
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
555 def next(self):
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
556 complete_example = self.src_iter.next()
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
557 if self.example:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
558 self.example._values=[complete_example[field]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
559 for field in self.ds.fieldnames]
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
560 else:
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
561 self.example=Example(self.ds.fieldnames,
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
562 [complete_example[field] for field in self.ds.fieldnames])
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
563 return self.example
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
564 return FieldsSubsetIterator(self)
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
565
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
    """
    Forward a minibatch request to the wrapped source dataset.

    Asserts that every name in `fieldnames` is reported by self.hasFields,
    then returns whatever self.src.minibatches_nowrap returns for the same
    four arguments (passed through unchanged).
    """
    all_requested_fields_visible = self.hasFields(*fieldnames)
    assert all_requested_fields_visible
    source = self.src
    return source.minibatches_nowrap(fieldnames, minibatch_size,
                                     n_batches, offset)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def __getitem__(self, i):
    """
    Index the wrapped source with `i` and wrap the result in a new
    FieldsSubsetDataSet constructed with this object's `fieldnames`.
    """
    selected = self.src[i]
    return FieldsSubsetDataSet(selected, self.fieldnames)
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
571
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
572
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class DataSetFields(LookupList):
    """
    Although a DataSet iterates over examples (like rows of a matrix), an associated
    DataSetFields iterates over fields (like columns of a matrix), and can be understood
    as a transpose of the associated dataset.

    To iterate over fields, one can do
     * for fields in dataset.fields()
     * for fields in dataset(field1,field2,...).fields() to select a subset of fields
     * for fields in dataset.fields(field1,field2,...) to select a subset of fields
    and each of these fields is iterable over the examples:
     * for field_examples in dataset.fields():
         for example_value in field_examples:
            ...
    but when the dataset is a stream (unbounded length), it is not recommended to do
    such things because the underlying dataset may refuse to access the different fields in
    an unsynchronized way. Hence the fields() method is illegal for streams, by default.
    The result of fields() is a DataSetFields object, which iterates over fields,
    and whose elements are iterable over examples. A DataSetFields object can
    be turned back into a DataSet with its examples() method:
      dataset2 = dataset1.fields().examples()
    and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).

    DataSetFields can be concatenated vertically or horizontally. To be consistent with
    the syntax used for DataSets, the | concatenates the fields and the & concatenates
    the examples.
    """
    def __init__(self, dataset, fieldnames):
        """
        Build the per-field view of `dataset`.

        dataset    -- the dataset to transpose.
        fieldnames -- names of the fields to expose; when empty/None, all of
                      dataset.fieldNames() are used. When it differs from
                      dataset.fieldNames(), the dataset is first wrapped in a
                      FieldsSubsetDataSet restricted to those names.

        Asserts that the (possibly wrapped) dataset has all requested fields.
        """
        # Keep a handle on the caller's dataset: it is still needed below
        # after `dataset` may have been rebound to a FieldsSubsetDataSet.
        original_dataset = dataset
        if not fieldnames:
            fieldnames = dataset.fieldNames()
        elif not fieldnames == dataset.fieldNames():
            dataset = FieldsSubsetDataSet(dataset, fieldnames)
        assert dataset.hasFields(*fieldnames)
        self.dataset = dataset

        if isinstance(dataset, MinibatchDataSet):
            # The field columns are already stored; reuse them directly.
            LookupList.__init__(self, fieldnames, list(dataset._fields))
        elif isinstance(original_dataset, MinibatchDataSet):
            # Subset of a MinibatchDataSet: pick the requested columns by name
            # instead of going through minibatches().
            # NOTE(review): the change history attributes this branch to an
            # infinite-loop fix -- confirm before simplifying.
            LookupList.__init__(self, fieldnames,
                                [original_dataset._fields[field]
                                 for field in fieldnames])
        else:
            # Generic case: fetch the whole dataset as one minibatch, whose
            # values are then used as the field columns.
            minibatch_iterator = dataset.minibatches(fieldnames,
                                                     minibatch_size=len(dataset),
                                                     n_batches=1)
            minibatch = minibatch_iterator.next()
            LookupList.__init__(self, fieldnames, minibatch)

    def examples(self):
        """
        Return the dataset this object was built from (the FieldsSubsetDataSet
        wrapper when a subset of the fields was requested).
        """
        return self.dataset

    def __or__(self, other):
        """
        fields1 | fields2 is a DataSetFields whose list of fields is the concatenation
        of the fields of DataSetFields fields1 and fields2.

        Implemented with the | operator of the underlying datasets, as stated
        in the class documentation.
        """
        return (self.examples() | other.examples()).fields()

    def __and__(self, other):
        """
        fields1 & fields2 is a DataSetFields whose list of examples is the concatenation
        of the list of examples of DataSetFields fields1 and fields2.

        Implemented with the & operator of the underlying datasets, as stated
        in the class documentation.
        """
        return (self.examples() & other.examples()).fields()
17
759d17112b23 more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
parents: 16 12
diff changeset
638
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
639
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class MinibatchDataSet(DataSet):
    """
    Turn a LookupList of same-length (iterable) fields into an example-iterable dataset.
    Each element of the lookup-list should be an iterable and sliceable, all of the same length.
    """
    def __init__(self, fields_lookuplist, values_vstack=DataSet().valuesVStack,
                 values_hstack=DataSet().valuesHStack):
        """
        The user can (and generally should) also provide values_vstack(fieldname,fieldvalues)
        and a values_hstack(fieldnames,fieldvalues) functions behaving with the same
        semantics as the DataSet methods of the same name (but without the self argument).

        Asserts that `fields_lookuplist` is non-empty and that all of its
        fields have the same length as the first one.
        """
        self._fields = fields_lookuplist
        assert len(fields_lookuplist) > 0
        self.length = len(fields_lookuplist[0])
        for field in fields_lookuplist[1:]:
            assert self.length == len(field)
        self.values_vstack = values_vstack
        self.values_hstack = values_hstack

    def __len__(self):
        """Return the number of examples (the common length of the fields)."""
        return self.length

    def __getitem__(self, i):
        """
        Dispatch on the kind of index:
          * slice or list -> DataSetFields over a new MinibatchDataSet holding
            field[i] for every field (the stacking functions are carried over);
          * int           -> an Example holding field[i] for every field;
          * field name    -> the stored values of that field;
          * anything else -> the instance attribute of that name.
        """
        if isinstance(i, (slice, list)):
            sub_fields = Example(self._fields.keys(),
                                 [field[i] for field in self._fields])
            # Pass the stacking functions along so the sub-dataset stacks its
            # values the same way as this one.
            sub_dataset = MinibatchDataSet(sub_fields,
                                           self.values_vstack,
                                           self.values_hstack)
            return DataSetFields(sub_dataset, self.fieldNames())
        if isinstance(i, int):
            return Example(self._fields.keys(),
                           [field[i] for field in self._fields])
        if self.hasFields(i):
            return self._fields[i]
        assert i in self.__dict__ # else it means we are trying to access a non-existing property
        return self.__dict__[i]

    def fieldNames(self):
        """Return the names of the fields, as given by the lookup-list keys."""
        return self._fields.keys()

    def hasFields(self, *fieldnames):
        """Return True if every given name is one of this dataset's field names."""
        known_names = self._fields.keys()
        for fieldname in fieldnames:
            if fieldname not in known_names:
                return False
        return True

    def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
        """
        Return an iterator over consecutive minibatches of `minibatch_size`
        examples starting at example `offset`.

        Each minibatch is an Example holding, for each requested field name
        (all fields when `fieldnames` is empty), the slice of that field for
        the current window. `n_batches` is not used by this iterator.

        Raises NotImplementedError at construction when the first minibatch
        would run past the end of the dataset; next() asserts that each later
        minibatch fits as well.
        """
        if fieldnames:
            selected_names = list(fieldnames)
        else:
            selected_names = self._fields.keys()
        assert self.hasFields(*selected_names)

        class Iterator(object):
            def __init__(self, ds):
                self.ds = ds
                self.next_example = offset
                assert minibatch_size > 0
                if offset + minibatch_size > ds.length:
                    raise NotImplementedError()

            def __iter__(self):
                return self

            def next(self):
                upper = self.next_example + minibatch_size
                assert upper <= self.ds.length
                fields = self.ds._fields
                # Only the requested fields, in the requested order.
                minibatch = Example(selected_names,
                                    [fields[name][self.next_example:upper]
                                     for name in selected_names])
                self.next_example += minibatch_size
                return minibatch

        return Iterator(self)

    def valuesVStack(self, fieldname, fieldvalues):
        """Delegate to the values_vstack function given at construction."""
        return self.values_vstack(fieldname, fieldvalues)

    def valuesHStack(self, fieldnames, fieldvalues):
        """Delegate to the values_hstack function given at construction."""
        return self.values_hstack(fieldnames, fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
709
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
710 class HStackedDataSet(DataSet):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
711 """
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
712 A DataSet that wraps several datasets and shows a view that includes all their fields,
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
713 i.e. whose list of fields is the concatenation of their lists of fields.
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
714
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
715 If a field name is found in more than one of the datasets, then either an error is
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
716 raised or the fields are renamed (either by prefixing the __name__ attribute
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
717 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
718
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
719 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ...
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
720 """
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
def __init__(self, datasets, accept_nonunique_names=False, description=None, field_types=None):
    """
    Wrap `datasets` (an indexable sequence of datasets of equal length) and
    build self.fieldname2dataset, mapping each exposed field name to the
    index in `datasets` of the dataset that provides it.

    When a field name appears in more than one dataset:
      * accept_nonunique_names=False -> ValueError is raised;
      * accept_nonunique_names=True  -> every occurrence is registered under
        a renamed key: dataset.__name__ + "." + fieldname when the dataset
        has a __name__ attribute, else fieldname + "." + str(index).

    Asserts that all datasets have the same length. Raises ValueError if a
    renamed field still collides with an already registered name.
    """
    DataSet.__init__(self, description, field_types)
    self.datasets = datasets
    self.accept_nonunique_names = accept_nonunique_names
    self.fieldname2dataset = {}

    def rename_field(fieldname, dataset, i):
        if hasattr(dataset, "__name__"):
            return dataset.__name__ + "." + fieldname
        return fieldname + "." + str(i)

    def register(fieldname, i):
        # Refuse to silently overwrite a field that is already registered.
        if fieldname in self.fieldname2dataset:
            raise ValueError("Incompatible datasets: non-unique field name = " + fieldname)
        self.fieldname2dataset[fieldname] = i

    # make sure all datasets have the same length and unique field names
    self.length = None
    # plain name -> index of the first dataset registered under it; those
    # entries are renamed once all datasets have been scanned.
    names_to_change = {}
    for i, dataset in enumerate(datasets):
        length = len(dataset)
        if self.length is None:
            self.length = length
        else:
            assert self.length == length
        for fieldname in dataset.fieldNames():
            if fieldname in self.fieldname2dataset: # name conflict!
                if not accept_nonunique_names:
                    raise ValueError("Incompatible datasets: non-unique field name = " + fieldname)
                names_to_change.setdefault(fieldname, self.fieldname2dataset[fieldname])
                fieldname = rename_field(fieldname, dataset, i)
            register(fieldname, i)
    # The first owner of each clashing name is still registered under the
    # plain name: move it to its renamed key as well.
    for fieldname, i in names_to_change.items():
        del self.fieldname2dataset[fieldname]
        register(rename_field(fieldname, self.datasets[i], i), i)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
753
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
def hasFields(self, *fieldnames):
    """
    Return True when every name in `fieldnames` is a key of
    self.fieldname2dataset (trivially True for no names), else False.
    """
    known = self.fieldname2dataset
    return all(name in known for name in fieldnames)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
759
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def fieldNames(self):
    """Return the keys of self.fieldname2dataset, i.e. the exposed field names."""
    name_to_index = self.fieldname2dataset
    return name_to_index.keys()
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
762
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
763 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
764
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
765 class HStackedIterator(object):
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
766 def __init__(self,hsds,iterators):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
767 self.hsds=hsds
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
768 self.iterators=iterators
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
769 def __iter__(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
770 return self
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
771 def next(self):
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
772 # concatenate all the fields of the minibatches
140
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 136
diff changeset
773 l=LookupList()
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 136
diff changeset
774 for iter in self.iterators:
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 136
diff changeset
775 l.append_lookuplist(iter.next())
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 136
diff changeset
776 return l
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
777
125
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 123
diff changeset
778 assert self.hasFields(*fieldnames)
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
779 # find out which underlying datasets are necessary to service the required fields
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
780 # and construct corresponding minibatch iterators
140
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 136
diff changeset
781 if fieldnames and fieldnames!=self.fieldNames():
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
782 datasets=set([])
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
783 fields_in_dataset=dict([(dataset,[]) for dataset in datasets])
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
784 for fieldname in fieldnames:
136
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 125
diff changeset
785 dataset=self.datasets[self.fieldname2dataset[fieldname]]
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
786 datasets.add(dataset)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
787 fields_in_dataset[dataset].append(fieldname)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
788 datasets=list(datasets)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
789 iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset)
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
790 for dataset in datasets]
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
791 else:
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
792 datasets=self.datasets
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
793 iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets]
44
5a85fda9b19b Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents: 43
diff changeset
794 return HStackedIterator(self,iterators)
36
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
795
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
796
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def valuesVStack(self,fieldname,fieldvalues):
    """Vertically stack fieldvalues by delegating to the valuesVStack of
    the sub-dataset that owns fieldname."""
    owner_index=self.fieldname2dataset[fieldname]
    owner=self.datasets[owner_index]
    return owner.valuesVStack(fieldname,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
799
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
def valuesHStack(self,fieldnames,fieldvalues):
    """
    Horizontally stack fieldvalues by handing the whole request to the
    sub-dataset that owns the first name in fieldnames.

    This is a heuristic: that sub-dataset is given all the fieldnames, in
    the hope that it can cope with values (and names) that belong to other
    sub-datasets. It always works when all the fieldnames come from the
    same sub-dataset.
    """
    first_owner_index=self.fieldname2dataset[fieldnames[0]]
    first_owner=self.datasets[first_owner_index]
    return first_owner.valuesHStack(fieldnames,fieldvalues)
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
808
438440ba0627 Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents: 29
diff changeset
class VStackedDataSet(DataSet):
    """
    A DataSet that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    TODO: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        """
        Build the stacked view over a non-empty sequence of datasets.

        Sets up:
          self.datasets           -- the wrapped datasets, in stacking order
          self.datasets_start_row -- global row at which each dataset starts
          self.index2dataset      -- map from global row to dataset index
          self.length             -- total number of rows
        """
        assert len(datasets)>0
        self.datasets=datasets
        self.length=0
        self.index2dataset={}
        self.datasets_start_row=[]
        fieldnames = datasets[-1].fieldNames()
        last_index = len(datasets)-1
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        for k,dataset in enumerate(datasets):
            if k<last_index:
                # All VStacked datasets (except possibly the last) must be bounded (have a length).
                assert not dataset.is_unbounded()
                assert dataset.fieldNames()==fieldnames
            # NOTE(review): len() is also taken on the last dataset (as the
            # previous version did), so an unbounded last dataset is not
            # actually supported here yet -- confirm the intended behaviour.
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print("A more efficient mechanism for index2dataset should be implemented")

    def __len__(self):
        """Return the total number of rows over all the stacked datasets."""
        return self.length

    def fieldNames(self):
        """Return the field names of the first dataset (all are asserted equal in __init__)."""
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        """Delegate the field membership test to the first dataset."""
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """Return (dataset_index, row_within_dataset) for global row number"""
        dataset_index = self.index2dataset[row]
        # The local row is the distance from the first row of that dataset.
        row_within_dataset = row-self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over minibatches of minibatch_size consecutive
        rows of the stacked view, starting at global row offset.

        At most n_batches minibatches are produced (no limit if n_batches is
        None), and iteration stops as soon as a full minibatch no longer
        fits before the end of the view (no wrap-around). A minibatch that
        straddles several datasets is assembled with valuesVStack.
        """

        class VStackedIterator(object):
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset
                self.batches_left=n_batches

            def __iter__(self):
                return self

            def next(self):
                if self.batches_left is not None and self.batches_left<=0:
                    raise StopIteration
                if self.next_row+minibatch_size>len(self.vsds):
                    raise StopIteration
                # Collect the pieces of this minibatch, one per dataset that
                # the row interval [next_row, next_row+minibatch_size) touches.
                pieces=[]
                first_dataset=None
                row=self.next_row
                n_missing=minibatch_size
                while n_missing>0:
                    dataset_index,row_within_dataset=self.vsds.locate_row(row)
                    dataset=self.vsds.datasets[dataset_index]
                    if first_dataset is None:
                        first_dataset=dataset
                    n_taken=min(n_missing,len(dataset)-row_within_dataset)
                    pieces.append(dataset.minibatches(fieldnames,n_taken,1,row_within_dataset).next())
                    row+=n_taken
                    n_missing-=n_taken
                if len(pieces)==1:
                    mb=pieces[0]
                else:
                    # The dataset holding the first rows does the stacking,
                    # as in the previous version of this code.
                    names=fieldnames or self.vsds.fieldNames()
                    mb=Example(names,
                               [first_dataset.valuesVStack(name,
                                                           [piece[name] for piece in pieces])
                                for name in names])
                self.next_row+=minibatch_size
                if self.batches_left is not None:
                    self.batches_left-=1
                return mb

        return VStackedIterator(self)
37
73c4212ba5b3 Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents: 36
diff changeset
916
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
class ArrayFieldsDataSet(DataSet):
    """
    Virtual super-class for datasets whose field values are numpy arrays.

    It provides numpy-based valuesHStack and valuesVStack, so that
    sub-classes do not have to define them.
    """
    def __init__(self,description=None,field_types=None):
        DataSet.__init__(self,description,field_types)

    def valuesHStack(self,fieldnames,fieldvalues):
        """Join the field values side by side with numpy.hstack, e.g. two
        vectors give a longer vector and two matrices give a wider matrix.
        The fieldnames argument is not used."""
        side_by_side=numpy.hstack(fieldvalues)
        return side_by_side

    def valuesVStack(self,fieldname,values):
        """Pile the field values on top of each other with numpy.vstack,
        e.g. two vectors give a two-row matrix and two matrices give a
        longer matrix. The fieldname argument is not used."""
        piled_up=numpy.vstack(values)
        return piled_up
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
932
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
933 class ArrayDataSet(ArrayFieldsDataSet):
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
934 """
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
935 An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
936 whose first axis iterates over examples, second axis determines fields.
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
937 If the underlying array is N-dimensional (has N axes), then the field
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
938 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2).
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
939 """
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
940
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def __init__(self, data_array, fields_columns):
    """
    Construct an ArrayDataSet from the underlying numpy array (data_array) and
    a map (fields_columns) from fieldnames to field columns. The columns of a field are specified
    using the standard arguments for indexing/slicing: integer for a column index,
    slice for an interval of columns (with possible stride), or iterable of column indices.

    The map is normalized in place (the caller's object is modified):
      - an integer column c is replaced by the one-element list [c];
      - a slice gets its missing start (-> 0) and step (-> 1) filled in,
        keeping any start/stop/step that was given;
      - an iterable of column indices is only checked.
    Integer columns and iterable entries must lie in
    [0, data_array.shape[1]); this is checked with assert.
    """
    self.data=data_array
    self.fields_columns=fields_columns
    n_columns=data_array.shape[1]

    # check consistency and complete slices definitions
    # (iterate over a snapshot since entries are reassigned in the loop)
    for fieldname, fieldcolumns in list(self.fields_columns.items()):
        if type(fieldcolumns) is int:
            assert fieldcolumns>=0 and fieldcolumns<n_columns
            # Store a list so that later code never has to special-case a bare index.
            self.fields_columns[fieldname]=[fieldcolumns]
        elif type(fieldcolumns) is slice:
            # Only fill in what is missing: an explicit start or step must survive.
            start=fieldcolumns.start or 0
            step=fieldcolumns.step or 1
            self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step)
        elif hasattr(fieldcolumns,"__iter__"): # something like a list
            for i in fieldcolumns:
                assert i>=0 and i<n_columns
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
967
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def fieldNames(self):
    """Return the field names, i.e. the keys of the fields_columns map."""
    declared_names=self.fields_columns.keys()
    return declared_names
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
970
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def __len__(self):
    """Return the number of examples, taken as len() of the underlying data array."""
    n_examples=len(self.data)
    return n_examples
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
973
80
Frederic Bastien <bastienf@iro.umontreal.ca>
parents: 66
diff changeset
def __getitem__(self,key):
    """
    More efficient implementation than the default __getitem__.

    The result depends on the type of key:
     - integer (Python int or numpy integer): an Example holding, for each
       field, the entries of row key of self.data in that field's columns;
     - slice: a MinibatchDataSet built from the selected rows;
     - list: a MinibatchDataSet built from the listed rows (an element of
       the list for which self.hasFields() is true is first replaced by
       that field's columns). The list given by the caller is not modified;
     - field name: the columns of that field, for all the rows;
     - anything else: key must be the name of an instance attribute,
       whose value is returned (AssertionError otherwise).
    """
    fieldnames=self.fields_columns.keys()
    # isinstance rather than an exact type test, so that indices coming out
    # of numpy computations (numpy.int32, numpy.int64, ...) select a row too.
    if isinstance(key,(int,numpy.integer)):
        return Example(fieldnames,
                       [self.data[key,self.fields_columns[f]] for f in fieldnames])
    if type(key) is slice:
        return MinibatchDataSet(Example(fieldnames,
                                        [self.data[key,self.fields_columns[f]] for f in fieldnames]))
    if type(key) is list:
        # Build a new list instead of rewriting key in place: the caller's
        # list must come back unchanged.
        rows=[self.fields_columns[k] if self.hasFields(k) else k for k in key]
        return MinibatchDataSet(Example(fieldnames,
                                        # A field stored as a list of columns is indexed in two
                                        # steps (rows first, then columns), because numpy does not
                                        # select the rows x columns block when given
                                        # self.data[[i1,...],[i2,...]].
                                        [self.data[rows,:][:,self.fields_columns[f]]
                                         if isinstance(self.fields_columns[f],list) else
                                         self.data[rows,self.fields_columns[f]] for f in fieldnames]),
                                self.valuesVStack,self.valuesHStack)

    # else check for a fieldname
    if self.hasFields(key):
        return self.data[:,self.fields_columns[key]]
    # else we are trying to access a property of the dataset
    assert key in self.__dict__ # else it means we are trying to access a non-existing property
    return self.__dict__[key]
55
66619ce44497 Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 48
diff changeset
1003
41
283e95c15b47 Added ArrayDataSet
bengioy@grenat.iro.umontreal.ca
parents: 40
diff changeset
1004
42
9b68774fcc6b Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents: 41
diff changeset
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    Return an iterator over consecutive minibatches of minibatch_size rows
    of self.data, the first one starting at row offset.

    If fieldnames is None, all the fields of the dataset are used.
    n_batches is accepted but not read by the iterator, which never raises
    StopIteration by itself.
    """
    class ArrayDataSetIterator(object):
        def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
            if fieldnames is None:
                fieldnames = dataset.fieldNames()
            # the minibatch is kept in a single lookup-list, whose values
            # are replaced at each call of next()
            placeholder_values = [0]*len(fieldnames)
            self.minibatch = LookupList(fieldnames,placeholder_values)
            self.dataset=dataset
            self.minibatch_size=minibatch_size
            # the first minibatch must lie entirely inside the data
            assert offset>=0 and offset<len(dataset.data)
            assert offset+minibatch_size<=len(dataset.data)
            self.current=offset

        def __iter__(self):
            return self

        def next(self):
            first_row = self.current
            last_row = first_row+self.minibatch_size
            sub_data = self.dataset.data[first_row:last_row]
            columns_of = self.dataset.fields_columns
            self.minibatch._values = [sub_data[:,columns_of[name]]
                                      for name in self.minibatch._names]
            self.current = last_row
            # the same LookupList object is returned at every call
            return self.minibatch

    return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1025
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1026
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
class CachedDataSet(DataSet):
    """
    Wrap a dataset whose values are computationally expensive to obtain
    (e.g. because they involve some computation, or disk access),
    so that repeated accesses to the same example are done cheaply,
    by caching every example value that has been accessed at least once.

    Optionally, for finite-length dataset, all the values can be computed
    (and cached) upon construction of the CachedDataSet, rather than at the
    first access.

    @todo when cache_all_upon_construction create mini-batches that are as
    large as possible but not so large as to fill up memory.

    @todo add disk-buffering capability, so that when the cache becomes too
    big for memory, we cache things on disk, trying to keep in memory only
    the record most likely to be accessed next.
    """
    def __init__(self,source_dataset,cache_all_upon_construction=False):
        """
        Wrap source_dataset. If cache_all_upon_construction is true, all the
        examples of source_dataset are read right away (as one minibatch of
        len(source_dataset) examples) and stored in self.cached_examples;
        otherwise the cache starts empty and is filled while iterating.

        fieldNames, hasFields, valuesHStack and valuesVStack are taken
        directly from source_dataset.
        """
        self.source_dataset=source_dataset
        self.cache_all_upon_construction=cache_all_upon_construction
        if cache_all_upon_construction:
            # this potentially brings all the source examples
            # into memory at once, which may be too much
            # the work could possibly be done by minibatches
            # that are as large as possible but no more than what memory allows.
            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
            # transpose the per-field values into a list with one entry per
            # example; a real list, because the cache is appended to and sliced
            self.cached_examples = list(zip(*fields_values))
        else:
            self.cached_examples = []

        self.fieldNames = source_dataset.fieldNames
        self.hasFields = source_dataset.hasFields
        self.valuesHStack = source_dataset.valuesHStack
        self.valuesVStack = source_dataset.valuesVStack

    def __len__(self):
        """Return the length of the source dataset."""
        return len(self.source_dataset)

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over consecutive minibatches of minibatch_size
        examples, the first one starting at example offset. Examples not yet
        in self.cached_examples are read from the source dataset and
        appended to the cache before being returned.

        If fieldnames is None, all the fields are returned. n_batches is
        not used by the iterator.
        """
        class CacheIterator(object):
            def __init__(self,dataset):
                self.dataset=dataset
                self.current=offset
            def __iter__(self): return self
            def next(self):
                upper = self.current+minibatch_size
                cache_len = len(self.dataset.cached_examples)
                if upper>cache_len: # whole minibatch is not already in cache
                    # cache everything from current length to upper
                    for example in self.dataset.source_dataset[cache_len:upper]:
                        self.dataset.cached_examples.append(example)
                all_names = self.dataset.fieldNames()
                # The cache holds one entry per example, whereas a minibatch
                # holds one sequence of values per field: transpose.
                all_fields_minibatch = Example(all_names,
                                               list(zip(*self.dataset.cached_examples[self.current:upper])))
                # move on, so that the next call returns the next minibatch
                self.current = upper
                if fieldnames is None or all_names==fieldnames:
                    return all_fields_minibatch
                return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
        return CacheIterator(self)
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1085
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1086
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1087 class ApplyFunctionDataSet(DataSet):
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1088 """
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1089 A dataset that contains as fields the results of applying a given function
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1090 example-wise or minibatch-wise to all the fields of an input dataset.
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1091 The output of the function should be an iterable (e.g. a list or a LookupList)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1092 over the resulting values.
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1093
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1094 In minibatch mode, the function is expected to work on minibatches (takes
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1095 a minibatch in input and returns a minibatch in output). More precisely,
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1096 it means that each element of the input or output list should be iterable
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1097 and indexable over the individual example values (typically these
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1098 elements will be numpy arrays). All of the elements in the input and
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1099 output lists should have the same length, which is the length of the
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1100 minibatch.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1101
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1102 The function is applied each time an example or a minibatch is accessed.
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1103 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1104
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1105 If the values_{h,v}stack functions are not provided, then
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1106 the input_dataset.values{H,V}Stack functions are used by default.
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1107 """
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
             values_hstack=None,values_vstack=None,
             description=None,fieldtypes=None):
    """
    Build a dataset whose fields are the outputs of function applied to
    the fields of input_dataset.

    input_dataset should have as many fields as function takes inputs, and
    output_names (a list) should have one name per output of function.

    With minibatch_mode the function receives minibatches of inputs and
    returns minibatches of outputs (see the class comment); otherwise it
    is applied example by example.

    values_hstack and values_vstack, when given, replace the
    valuesHStack / valuesVStack functions of input_dataset.
    """
    self.input_dataset, self.function = input_dataset, function
    self.output_names, self.minibatch_mode = output_names, minibatch_mode
    DataSet.__init__(self,description,fieldtypes)
    # fall back on the stacking functions of the input dataset
    self.valuesHStack = values_hstack or input_dataset.valuesHStack
    self.valuesVStack = values_vstack or input_dataset.valuesVStack
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1128
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
def __len__(self):
    """Return the length of the input dataset (one output example per input example)."""
    source = self.input_dataset
    return len(source)
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1131
134
3f4e5c9bdc5e Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 132
diff changeset
def fieldNames(self):
    """Return the names of the output fields, as given to the constructor."""
    names = self.output_names
    return names
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1134
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    Return an iterator over minibatches of function outputs.

    Each minibatch is obtained by reading the next minibatch of the input
    dataset (with the same minibatch_size, n_batches and offset) and
    applying the function to it, either in one call (minibatch mode) or
    example by example. Only the fields listed in fieldnames are returned.
    """
    class ApplyFunctionIterator(object):
        def __init__(self,output_dataset):
            self.input_dataset=output_dataset.input_dataset
            self.output_dataset=output_dataset
            input_minibatches=self.input_dataset.minibatches(minibatch_size=minibatch_size,
                                                             n_batches=n_batches,offset=offset)
            self.input_iterator=input_minibatches.__iter__()

        def __iter__(self): return self

        def next(self):
            function_inputs = self.input_iterator.next()
            target = self.output_dataset
            all_output_names = target.output_names
            if not target.minibatch_mode:
                # example-wise: regroup the per-field values into examples,
                # apply the function to each of them, then stack the
                # outputs field by field
                # NOTE(review): the function receives each example as one
                # argument here, whereas in minibatch mode the fields are
                # unpacked into separate arguments -- confirm this is intended.
                output_examples = []
                for input_example in zip(*function_inputs):
                    output_examples.append(target.function(input_example))
                values_by_field = zip(*output_examples)
                function_outputs = [target.valuesVStack(name,values)
                                    for name,values in zip(all_output_names,values_by_field)]
            else:
                function_outputs = target.function(*function_inputs)
            all_outputs = Example(all_output_names,function_outputs)
            if fieldnames==all_output_names:
                return all_outputs
            # keep only the requested fields, in the requested order
            selected_values = [all_outputs[name] for name in fieldnames]
            return Example(fieldnames,selected_values)

    return ApplyFunctionIterator(self)
73
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
1163
69f97aad3faf Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents: 72
diff changeset
def __iter__(self):
    """Iterate over the transformed dataset one example at a time.

    This override exists only for efficiency: it walks the example
    iterator of ``self.input_dataset`` directly and applies
    ``self.function`` to each example as it is produced.

    Each item yielded is an ``Example`` whose field names are
    ``self.output_names``.  When ``self.minibatch_mode`` is true, element
    ``[0]`` of every output returned by the function is used as the value
    of the corresponding field.
    """
    class ApplyFunctionSingleExampleIterator(object):
        """Python 2 style iterator (``next``) over single output examples."""

        def __init__(self, output_dataset):
            self.current = 0
            self.output_dataset = output_dataset
            self.input_iterator = output_dataset.input_dataset.__iter__()

        def __iter__(self):
            return self

        def next(self):
            # StopIteration raised by the input iterator propagates
            # unchanged and terminates this iterator as well.
            example_inputs = self.input_iterator.next()
            dataset = self.output_dataset
            if not dataset.minibatch_mode:
                field_values = dataset.function(example_inputs)
            else:
                # Keep only the first element of each output.
                field_values = [batched[0]
                                for batched in dataset.function(example_inputs)]
            return Example(dataset.output_names, field_values)

    return ApplyFunctionSingleExampleIterator(self)
57
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1179
1aabd2e2bb5f Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 56
diff changeset
1180
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
    """
    Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the
    user to define a set of fields as the 'input' field and a set of fields
    as the 'target' field. Optionally, a single weight_field can also be defined.

    Parameters:
      src_dataset   -- object providing ``merge_fields(*pairs)``, where each
                       pair is ``(list_of_source_fieldnames, merged_name)``.
      input_fields  -- field names grouped under the name 'input'.
      target_fields -- field names grouped under the name 'target'.
      weight_field  -- optional single field name exposed as 'weight';
                       ignored when it is None (or otherwise false).

    Returns whatever ``src_dataset.merge_fields`` returns for those pairs.
    """
    # The original referred to an undefined name `output_fields` here;
    # the parameter holding the target field names is `target_fields`.
    args = ((input_fields,'input'),(target_fields,'target'))
    if weight_field:
        # The trailing comma matters: without it the parentheses are mere
        # grouping and `+=` would splice the list and the string into
        # `args` as two separate items instead of one (fields, name) pair.
        args += (([weight_field],'weight'),)
    return src_dataset.merge_fields(*args)
23
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1190
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1191
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1192
526e192b0699 Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents: 22
diff changeset
1193