Mercurial > pylearn
annotate dataset.py @ 434:0f366ecb11ee
log2->log in cost
author | Olivier Breuleux <breuleuo@iro.umontreal.ca> |
---|---|
date | Mon, 04 Aug 2008 16:21:59 -0400 |
parents | 835830e52b42 |
children | 32c5f87bc54e |
rev | line source |
---|---|
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
1 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
2 from lookup_list import LookupList as Example |
356
18702ceb2096
Added more functions
Joseph Turian <turian@iro.umontreal.ca>
parents:
354
diff
changeset
|
3 from common.misc import unique_elements_list_intersection |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
4 from string import join |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
5 from sys import maxint |
171
895b4b60f5e8
bugfix. Otherwise the example was writed over and not a new one was returned
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
167
diff
changeset
|
6 import numpy, copy |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
7 |
166 | 8 from exceptions import * |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
9 |
110
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
10 class AttributesHolder(object): |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
11 def __init__(self): pass |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
12 |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
13 def attributeNames(self): |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
14 raise AbstractFunction() |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
15 |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
16 def setAttributes(self,attribute_names,attribute_values,make_copies=False): |
134
3f4e5c9bdc5e
Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
132
diff
changeset
|
17 """ |
3f4e5c9bdc5e
Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
132
diff
changeset
|
18 Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1. |
3f4e5c9bdc5e
Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
132
diff
changeset
|
19 """ |
3f4e5c9bdc5e
Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
132
diff
changeset
|
20 if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ): |
3f4e5c9bdc5e
Fixes to ApplyFunctionDataSet and other things to make learner and mlp work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
132
diff
changeset
|
21 attribute_values = [attribute_values] |
110
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
22 if make_copies: |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
23 for name,value in zip(attribute_names,attribute_values): |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
24 self.__setattr__(name,copy.deepcopy(value)) |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
25 else: |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
26 for name,value in zip(attribute_names,attribute_values): |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
27 self.__setattr__(name,value) |
193
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
28 |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
29 def getAttributes(self,attribute_names=None, return_copy=False): |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
30 """ |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
31 Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes. |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
32 """ |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
33 if attribute_names is None: |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
34 attribute_names = self.attributeNames() |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
35 if return_copy: |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
36 return [copy.copy(self.__getattribute__(name)) for name in attribute_names] |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
37 else: |
cb6b945acf5a
Complete redesign of learner...
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
188
diff
changeset
|
38 return [self.__getattribute__(name) for name in attribute_names] |
110
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
39 |
8fa1ef2411a0
Worked on OneShotTLearner and implementation of LinearRegression
bengioy@bengiomac.local
parents:
105
diff
changeset
|
40 class DataSet(AttributesHolder): |
16 | 41 """A virtual base class for datasets. |
42 | |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
43 A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
44 with learning algorithms (for training and testing them): rows/records are called examples, and |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
45 columns/attributes are called fields. The field value for a particular example can be an arbitrary |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
46 python object, which depends on the particular dataset. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
47 |
241
ddb88a8e9fd2
If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents:
231
diff
changeset
|
48 We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method |
48
b6730f9a336d
Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents:
46
diff
changeset
|
49 should return sys.maxint). |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
50 |
16 | 51 A DataSet is a generator of iterators; these iterators can run through the |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
52 examples or the fields in a variety of ways. A DataSet need not necessarily have a finite |
16 | 53 or known length, so this class can be used to interface to a 'stream' which |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
54 feeds on-line learning (however, as noted below, some operations are not |
241
ddb88a8e9fd2
If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents:
231
diff
changeset
|
55 feasible or not recommended on streams). |
16 | 56 |
57 To iterate over examples, there are several possibilities: | |
90
a289b8bed64c
corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
88
diff
changeset
|
58 - for example in dataset: |
a289b8bed64c
corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
88
diff
changeset
|
59 - for val1,val2,... in dataset: |
a289b8bed64c
corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
88
diff
changeset
|
60 - for example in dataset(field1, field2,field3, ...): |
a289b8bed64c
corrected comment
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
88
diff
changeset
|
61 - for val1,val2,val3 in dataset(field1, field2,field3): |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
62 - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
63 - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
64 Each of these is documented below. All of these iterators are expected |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
65 to provide, in addition to the usual 'next()' method, a 'next_index()' method |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
66 which returns a non-negative integer pointing to the position of the next |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
67 example that will be returned by 'next()' (or of the first example in the |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
68 next minibatch returned). This is important because these iterators |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
69 can wrap around the dataset in order to do multiple passes through it, |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
70 in possibly irregular ways if the minibatch size is not a divisor of the |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
71 dataset length. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
72 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
73 To iterate over fields, one can do |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
74 - for field in dataset.fields(): |
46
c5b07e87b0cb
comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
45
diff
changeset
|
75 for field_value in field: # iterate over the values associated to that field for all the dataset examples |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
76 - for field in dataset(field1,field2,...).fields() to select a subset of fields |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
77 - for field in dataset.fields(field1,field2,...) to select a subset of fields |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
78 and each of these fields is iterable over the examples: |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
79 - for field_examples in dataset.fields(): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
80 for example_value in field_examples: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
81 ... |
241
ddb88a8e9fd2
If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents:
231
diff
changeset
|
82 but when the dataset is a stream (unbounded length), it is not recommended to do |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
83 such things because the underlying dataset may refuse to access the different fields in |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
84 an unsynchronized way. Hence the fields() method is illegal for streams, by default. |
132
f6505ec32dc3
Updated documentation slightly
Joseph Turian <turian@gmail.com>
parents:
128
diff
changeset
|
85 The result of fields() is a L{DataSetFields} object, which iterates over fields, |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
86 and whose elements are iterable over examples. A DataSetFields object can |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
87 be turned back into a DataSet with its examples() method:: |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
88 dataset2 = dataset1.fields().examples() |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
89 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
90 |
16 | 91 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. |
92 | |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
93 Note: The content of a field can be of any type. Field values can also be 'missing' |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
94 (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array) |
46
c5b07e87b0cb
comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
45
diff
changeset
|
95 fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value. |
c5b07e87b0cb
comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
45
diff
changeset
|
96 What about non-numeric values? None. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
97 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
98 Dataset elements can be indexed and sub-datasets (with a subset |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
99 of examples) can be extracted. These operations are not supported |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
100 by default in the case of streams. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
101 |
317
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
102 - dataset[:n] returns an Example with the first n examples. |
16 | 103 |
317
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
104 - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
105 |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
106 - dataset[i] returns an Example. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
107 |
378 | 108 - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in. |
317
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
109 |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
110 A similar command gives you a DataSet instead of Examples : |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
111 |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
112 - dataset.subset[:n] returns a DataSet with the first n examples. |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
113 |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
114 - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s. |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
115 |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
116 - dataset.subset[i] returns a DataSet. |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
117 |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
118 - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in. |
14081904d8f3
doc updated regarding __getitem__ returning LookupList and .subset returning a DataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
316
diff
changeset
|
119 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
120 |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
121 - dataset.<property> returns the value of a property associated with |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
122 the name <property>. The following properties should be supported: |
41 | 123 - 'description': a textual description or name for the dataset |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
124 - 'fieldtypes': a list of types (one per field) |
78 | 125 A DataSet may have other attributes that it makes visible to other objects. These are |
126 used to store information that is not example-wise but global to the dataset. | |
127 The list of names of these attributes is given by the attributeNames() method. | |
41 | 128 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
129 Datasets can be concatenated either vertically (increasing the length) or |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
130 horizontally (augmenting the set of fields), if they are compatible, using |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
131 the following operations (with the same basic semantics as numpy.hstack |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
132 and numpy.vstack): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
133 |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
134 - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3]) |
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
135 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
136 creates a new dataset whose list of fields is the concatenation of the list of |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
137 fields of the argument datasets. This only works if they all have the same length. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
138 |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
139 - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3]) |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
140 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
141 creates a new dataset that concatenates the examples from the argument datasets |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
142 (and whose length is the sum of the length of the argument datasets). This only |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
143 works if they all have the same fields. |
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
144 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
145 According to the same logic, and viewing a DataSetFields object associated to |
46
c5b07e87b0cb
comments modif made by Yoshua
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
45
diff
changeset
|
146 a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
147 a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
148 examples. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
149 |
41 | 150 A dataset can hold arbitrary key-value pairs that may be used to access meta-data |
151 or other properties of the dataset or associated with the dataset or the result | |
152 of a computation stored in a dataset. These can be accessed through the [key] syntax | |
153 when key is a string (or more specifically, neither an integer, a slice, nor a list). | |
78 | 154 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
155 A DataSet sub-class should always redefine the following methods: |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
156 - __len__ if it is not a stream |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
157 - fieldNames |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
158 - minibatches_nowrap (called by DataSet.minibatches()) |
269
fdce496c3b56
deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
268
diff
changeset
|
159 For efficiency of implementation, a sub-class might also want to redefine |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
160 - valuesHStack |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
161 - valuesVStack |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
162 - hasFields |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
163 - __getitem__ may not be feasible with some streams |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
66
diff
changeset
|
164 - __iter__ |
78 | 165 A sub-class should also append attributes to self._attribute_names |
166 (the default value returned by attributeNames()). | |
167 By convention, attributes not in attributeNames() should have a name | |
168 starting with an underscore. | |
169 @todo enforce/test that convention! | |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
170 """ |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
171 |
83 | 172 numpy_vstack = lambda fieldname,values: numpy.vstack(values) |
173 numpy_hstack = lambda fieldnames,values: numpy.hstack(values) | |
77
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
174 |
292 | 175 def __init__(self, description=None, fieldnames=None, fieldtypes=None): |
176 """ | |
177 @type fieldnames: list of strings | |
178 @type fieldtypes: list of python types, same length as fieldnames | |
179 @type description: string | |
180 @param description: description/name for this dataset | |
181 """ | |
182 def default_desc(): | |
183 return type(self).__name__ \ | |
184 + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" | |
185 | |
186 #self.fieldnames = fieldnames | |
187 | |
188 self.fieldtypes = fieldtypes if fieldtypes is not None \ | |
189 else [None]*1 #len(fieldnames) | |
190 | |
191 self.description = default_desc() if description is None \ | |
192 else description | |
78 | 193 self._attribute_names = ["description"] |
292 | 194 |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
195 |
292 | 196 attributeNames = property(lambda self: copy.copy(self._attribute_names)) |
197 | |
198 def __contains__(self, fieldname): | |
199 return (fieldname in self.fieldNames()) \ | |
200 or (fieldname in self.attributeNames()) | |
201 | |
202 def __iter__(self): | |
203 """Supports the syntax "for i in dataset: ..." | |
78 | 204 |
292 | 205 Using this syntax, "i" will be an Example instance (or equivalent) with |
206 all the fields of DataSet self. Every field of "i" will give access to | |
207 a field of a single example. Fields should be accessible via | |
208 i["fieldname"] or i[3] (in the order defined by the elements of the | |
209 Example returned by this iterator), but the derived class is free | |
210 to accept any type of identifier, and add extra functionality to the iterator. | |
211 | |
212 The default implementation calls the minibatches iterator and extracts the first example of each field. | |
213 """ | |
214 return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) | |
215 | |
216 def __len__(self): | |
217 """ | |
218 len(dataset) returns the number of examples in the dataset. | |
219 By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). | |
220 Sub-classes which implement finite-length datasets should redefine this method. | |
221 Some methods only make sense for finite-length datasets. | |
222 """ | |
223 return None | |
224 | |
78 | 225 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
226 class MinibatchToSingleExampleIterator(object): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
227 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
228 Converts the result of minibatch iterator with minibatch_size==1 into |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
229 single-example values in the result. Therefore the result of |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
230 iterating on the dataset itself gives a sequence of single examples |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
231 (whereas the result of iterating over minibatches gives in each |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
232 Example field an iterable object over the individual examples in |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
233 the minibatch). |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
234 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
235 def __init__(self, minibatch_iterator): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
236 self.minibatch_iterator = minibatch_iterator |
44
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
237 self.minibatch = None |
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
238 def __iter__(self): #makes for loop work |
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
239 return self |
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
def next(self):
    """
    Fetch the next size-1 minibatch and return it as a single example.

    Each field value of the minibatch is replaced by its first element
    (value[0]). If any value cannot be indexed that way (e.g. a 0-d array
    such as array(327), or a plain scalar), all values are used unchanged.

    The same Example object is returned on every call and its values are
    overwritten in place, so callers that need to keep an example across
    iterations must copy it.

    Propagates whatever the wrapped iterator's next() raises (typically
    StopIteration at the end of the data).
    """
    size1_minibatch = self.minibatch_iterator.next()
    fields = size1_minibatch.values()
    try:
        values = [value[0] for value in fields]
    except (IndexError, TypeError):
        # Hack kept from the original code: some datasets hand back values
        # that are already single examples and cannot be indexed with [0].
        # The fallback is now applied on every call, not only the first one.
        values = list(fields)
    if self.minibatch is None:
        # First call: build the Example that is reused afterwards.
        self.minibatch = Example(size1_minibatch.keys(), values)
    else:
        self.minibatch._values = values
    return self.minibatch
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
253 |
23
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
def next_index(self):
    """Return the result of the wrapped minibatch iterator's next_index()."""
    return self.minibatch_iterator.next_index()
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
256 |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
257 class MinibatchWrapAroundIterator(object): |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
258 """ |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
259 An iterator for minibatches that handles the case where we need to wrap around the |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
260 dataset because n_batches*minibatch_size > len(dataset). It is constructed from |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
261 a dataset that provides a minibatch iterator that does not need to handle that problem. |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
262 This class is a utility for dataset subclass writers, so that they do not have to handle |
38
d637ad8f7352
Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents:
37
diff
changeset
|
263 this issue multiple times, nor check that fieldnames are valid, nor handle the |
d637ad8f7352
Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents:
37
diff
changeset
|
264 empty fieldnames (meaning 'use all the fields'). |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
265 """ |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
266 def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
267 self.dataset=dataset |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
268 self.fieldnames=fieldnames |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
269 self.minibatch_size=minibatch_size |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
270 self.n_batches=n_batches |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
271 self.n_batches_done=0 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
272 self.next_row=offset |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
273 self.L=len(dataset) |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
274 self.offset=offset % self.L |
98
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
275 ds_nbatches = (self.L-self.next_row)/self.minibatch_size |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
276 if n_batches is not None: |
98
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
277 ds_nbatches = min(n_batches,ds_nbatches) |
38
d637ad8f7352
Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents:
37
diff
changeset
|
278 if fieldnames: |
d637ad8f7352
Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents:
37
diff
changeset
|
279 assert dataset.hasFields(*fieldnames) |
d637ad8f7352
Finished first untested version of VStackedDataset
bengioy@esprit.iro.umontreal.ca
parents:
37
diff
changeset
|
280 else: |
98
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
281 self.fieldnames=dataset.fieldNames() |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
282 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row) |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
283 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
284 def __iter__(self): |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
285 return self |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
286 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
287 def next_index(self): |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
288 return self.next_row |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
289 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
290 def next(self): |
43
e92244f30116
Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents:
42
diff
changeset
|
291 if self.n_batches and self.n_batches_done==self.n_batches: |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
292 raise StopIteration |
101
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
293 elif not self.n_batches and self.next_row ==self.L: |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
294 raise StopIteration |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
295 upper = self.next_row+self.minibatch_size |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
296 if upper <=self.L: |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
297 minibatch = self.iterator.next() |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
298 else: |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
299 if not self.n_batches: |
101
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
300 upper=min(upper, self.L) |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
301 # if their is not a fixed number of batch, we continue to the end of the dataset. |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
302 # this can create a minibatch that is smaller then the minibatch_size |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
303 assert (self.L-self.next_row)<=self.minibatch_size |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
304 minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
305 else: |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
306 # we must concatenate (vstack) the bottom and top parts of our minibatch |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
307 # first get the beginning of our minibatch (top of dataset) |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
308 first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
309 second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next() |
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
310 minibatch = Example(self.fieldnames, |
268
3f1cd8897fda
reverting dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
266
diff
changeset
|
311 [self.dataset.valuesVStack(name,[first_part[name],second_part[name]]) |
101
a1740a99b81f
by default, in a minibatch without any fixed number of batchs, we need to finish at the end of the dataset. Now we return a minibatch at the end event if this minibacht size != the gived minibatch_size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
99
diff
changeset
|
312 for name in self.fieldnames]) |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
313 self.next_row=upper |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
314 self.n_batches_done+=1 |
43
e92244f30116
Corrected iterator logic errors
bengioy@grenat.iro.umontreal.ca
parents:
42
diff
changeset
|
315 if upper >= self.L and self.n_batches: |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
316 self.next_row -= self.L |
98
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
317 ds_nbatches = (self.L-self.next_row)/self.minibatch_size |
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
318 if self.n_batches is not None: |
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
319 ds_nbatches = min(self.n_batches,ds_nbatches) |
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
320 self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, |
7186e4f502d1
bugfix in DataSet.minibatch to correctly wrap in all corner case.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
95
diff
changeset
|
321 ds_nbatches,self.next_row) |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
322 return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack, |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
323 self.dataset.valuesHStack), |
74
b4159cbdc06b
Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
73
diff
changeset
|
324 minibatch.keys()) |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
325 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
326 |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
minibatches_fieldnames = None
minibatches_minibatch_size = 1
minibatches_n_batches = None
def minibatches(self,
                fieldnames = minibatches_fieldnames,
                minibatch_size = minibatches_minibatch_size,
                n_batches = minibatches_n_batches,
                offset = 0):
    """
    Return an iterator that supports three forms of syntax:

        for i in dataset.minibatches(None,**kwargs): ...

        for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

        for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

    Using the first two syntaxes, "i" will be an indexable object, such as a list,
    tuple, or Example instance. In both cases, i[k] is a list-like container
    of a batch of current examples. In the second case, i[0] is a
    list-like container of the f1 field of a batch of current examples, i[1] is
    a list-like container of the f2 field, etc.

    Using the first syntax, all the fields will be returned in "i".
    Using the third syntax, i1, i2, i3 will be list-like containers of the
    f1, f2, and f3 fields of a batch of examples on each loop iteration.

    The minibatches iterator is expected to return upon each call to next()
    a DataSetFields object, which is an Example (indexed by the field names) whose
    elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
    a sub-dataset that can be used to iterate over the individual examples
    in the minibatch. Hence a minibatch can be converted back to a regular
    dataset or its fields can be looked at individually (and possibly iterated over).

    PARAMETERS
    - fieldnames (list of any type, default None):
    The loop variables i1, i2, i3 (in the example above) should contain the
    f1, f2, and f3 fields of the current batch of examples. If None,
    self.fieldNames() is used, i.e. all fields.

    - minibatch_size (integer, default 1)
    On every iteration, the variables i1, i2, i3 will have
    exactly minibatch_size elements. e.g. len(i1) == minibatch_size

    @DEPRECATED n_batches : not used anywhere
    - n_batches (integer, default None)
    The iterator will loop exactly this many times, and then stop. If None,
    the derived class can choose a default. If (-1), then the returned
    iterator should support looping indefinitely.

    - offset (integer, default 0)
    The iterator will start at example 'offset' in the dataset, rather than the default.

    Note: A list-like container is something like a tuple, list, numpy.ndarray or
    any other object that supports integer indexing and slicing.

    @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete
    batches only, raise StopIteration.
    @ATTENTION: minibatches returns a LookupList, we can't iterate over examples on it.

    Raises AssertionError if offset is negative, or if the first minibatch
    (rows offset .. offset+minibatch_size-1) does not fit inside the dataset.
    """
    # NOTE: wrapping around the end of the dataset (formerly done by returning
    # DataSet.MinibatchWrapAroundIterator) is disabled; the arguments are only
    # sanity-checked and then passed straight to minibatches_nowrap.
    assert offset >= 0, "offset must be non-negative"
    length = len(self)
    assert offset < length, "offset must be smaller than the dataset length"
    assert offset + minibatch_size - 1 < length, \
        "the first minibatch must fit inside the dataset"
    if fieldnames is None:
        fieldnames = self.fieldNames()
    return self.minibatches_nowrap(fieldnames, minibatch_size, n_batches, offset)
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
395 |
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
    """
    Abstract factory for the minibatch iterator; every sub-class must override it.

    DataSet.minibatches() checks offset and minibatch_size with asserts,
    replaces fieldnames=None by self.fieldNames(), and then returns the
    iterator built here as-is.

    Design contract, as originally written: implementations need not handle
    wrapping around the dataset several times, the termination condition
    (StopIteration), or a next_index() method, since those were meant to be
    supplied by MinibatchWrapAroundIterator. The next() method of the returned
    iterator may assume that the rows it is asked for lie within
    [0,len(dataset)).

    NOTE(review): the MinibatchWrapAroundIterator call is commented out in
    minibatches(), so confirm which of these guarantees callers still get.

    @raise AbstractFunction: always, in this base implementation.
    """
    raise AbstractFunction()
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
409 |
48
b6730f9a336d
Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents:
46
diff
changeset
|
def is_unbounded(self):
    """
    Tell whether this dataset has no finite length (e.g. a stream).

    @rtype: bool
    @returns: True when len(self) is exactly maxint, the value compared
        against here as the marker of an unbounded dataset.
    """
    n_examples = len(self)
    return n_examples == maxint
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
415 |
26
672fe4b23032
Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents:
23
diff
changeset
|
def hasFields(self,*fieldnames):
    """
    Return True if the given field name (or every one of the field names, if
    multiple arguments are given) is recognized by the DataSet (i.e. can be
    used as a field name in one of the iterators).

    This method is used as a validation step (assert self.hasFields(*names))
    before building field subsets, so a single unknown name must make it
    return False. Calling it without any field name returns False.

    The default implementation may be inefficient (O(# fields in dataset)), as
    it calls the fieldNames() method. Many datasets may store their field names
    in a dictionary, which would allow more efficiency.

    @type fieldnames: field names, passed as separate positional arguments
    @rtype: bool
    """
    if not fieldnames:
        return False
    # fetch the names once: fieldNames() may be expensive in sub-classes
    known_names = self.fieldNames()
    return all(name in known_names for name in fieldnames)
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
426 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
def fieldNames(self):
    """
    Abstract accessor for the field names of the dataset.

    Sub-classes return the list of field names that their iterators support,
    i.e. the names for which hasFields(fieldname) would return True.

    @raise AbstractFunction: always, in this base implementation.
    """
    raise AbstractFunction()
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
433 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
def __call__(self,*fieldnames):
    """
    Return a dataset that sees only the fields whose names are specified.

    The result is a FieldsSubsetDataSet built around this dataset and the
    list of requested names.

    @see: cached_fields_subset, which returns self.fields(...).examples()
        instead.
    """
    # the requested names must pass the hasFields() check
    assert self.hasFields(*fieldnames)
    return FieldsSubsetDataSet(self, list(fieldnames))
d580b3a369a4
dataset__call__() returns a FieldsSubsetDataSet, so still a subset of fields, but not cached any more. I added the function dataset.cached_fields_subset(self,*fieldnames) that returns the old version, cached, in case someone needs it. Current behaviour passes the tests.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
353
diff
changeset
|
442 |
d580b3a369a4
dataset__call__() returns a FieldsSubsetDataSet, so still a subset of fields, but not cached any more. I added the function dataset.cached_fields_subset(self,*fieldnames) that returns the old version, cached, in case someone needs it. Current behaviour passes the tests.
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
353
diff
changeset
|
def cached_fields_subset(self,*fieldnames) :
    """
    Cached counterpart of __call__(*fieldnames): the requested fields are
    obtained through self.fields(*fieldnames) and returned as the result of
    its examples() method.

    @see : dataset.__call__
    """
    # the requested names must pass the hasFields() check
    assert self.hasFields(*fieldnames)
    selected_fields = self.fields(*fieldnames)
    return selected_fields.examples()
20
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
450 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
def fields(self,*fieldnames):
    """
    Build the DataSetFields object associated with this dataset.

    @param fieldnames: the field names to expose; they are handed to
        DataSetFields as a single tuple.
    @returns: DataSetFields(self, fieldnames)
    """
    requested_names = fieldnames
    return DataSetFields(self, requested_names)
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
456 |
269
fdce496c3b56
deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
268
diff
changeset
|
def getitem_key(self, fieldname):
    """Former string-key branch of __getitem__, kept here as a separate
    method (removed from getitem as per discussion June 4. --JSB).

    If fieldname is a field of the dataset, return the value of that field
    for one minibatch covering the whole dataset. Otherwise fieldname must
    be an attribute stored in the instance dictionary, whose value is
    returned.
    """
    if not self.hasFields(fieldname):
        # not a field, so we are trying to access a property of the dataset;
        # the assert fails when that property does not exist
        assert fieldname in self.__dict__
        return self.__dict__[fieldname]
    # a single minibatch as long as the dataset, restricted to that field
    whole_dataset = self.minibatches(fieldnames=[fieldname],
                                     minibatch_size=len(self),
                                     n_batches=1,
                                     offset=0)
    return whole_dataset.next()[0]
fdce496c3b56
deprecating __getitem__[fieldname] syntax
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
268
diff
changeset
|
470 |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
def __getitem__(self,i):
    """
    @rtype: Example
    @returns: single or multiple examples

    @type i: integer or slice or <iterable> of integers
    @param i:
        dataset[i] returns the (i+1)-th example of the dataset.
        dataset[i:j] returns a LookupList with examples i,i+1,...,j-1.
        dataset[i:j:s] returns a LookupList with examples i,i+s,i+2s,...
        stopping before j.
        dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in.

        Slice bounds follow the Python list rules: they are resolved with
        slice.indices(len(self)), so negative bounds count from the end,
        out-of-range bounds are clamped and negative steps walk backwards.
        Negative integer indices are not supported: a plain negative
        integer fails an assert, a negative entry in an index list raises
        IndexError.

    @raise IndexError: an integer index (alone or inside a list) is out of
        range.
    @raise TypeError: i, or an element of i, is of an unsupported type.

    @note:
    Some stream datasets may be unable to implement random access, i.e.
    arbitrary slicing/indexing because they can only iterate through
    examples one or a minibatch at a time and do not actually store or keep
    past (or future) examples.

    The default implementation of getitem uses the minibatches iterator
    to obtain one example, one slice, or a list of examples. It may not
    always be the most efficient way to obtain the result, especially if
    the data are actually stored in a memory array.
    """

    if type(i) is int:
        assert i >= 0 # TBM: see if someone complains and want negative i
        if i >= len(self) :
            raise IndexError(i)
        i_batch = self.minibatches_nowrap(self.fieldNames(),
                minibatch_size=1, n_batches=1, offset=i)
        return DataSet.MinibatchToSingleExampleIterator(i_batch).next()

    if type(i) is slice:
        # resolve None / negative / too large bounds exactly as a list would
        start, stop, step = i.indices(len(self))
        if step == 1:
            # contiguous slice: fetched as one minibatch (a LookupList);
            # an empty slice asks for 0 examples, never a negative count
            return self.minibatches_nowrap(self.fieldNames(),
                    minibatch_size=max(stop - start, 0),
                    n_batches=1,
                    offset=start).next()
        # slice with a step: convert it to a list of indices and let the
        # list code below handle it
        i = list(range(start, stop, step))

    # handle tuples, arrays, lists
    if hasattr(i, '__getitem__'):
        n_examples = len(self)
        # validate every index before fetching anything
        for idx in i:
            #dis-allow nested slices
            if not isinstance(idx, int):
                raise TypeError(idx)
            # negative offsets are not valid for the minibatch iterators
            if idx < 0 or idx >= n_examples :
                raise IndexError(idx)
        fieldnames = self.fieldNames()
        # fetch each requested example as a minibatch of size 1
        examples = [self.minibatches_nowrap(fieldnames,
                minibatch_size=1, n_batches=1, offset=idx).next()
                for idx in i]
        # re-index the fields in each example by field instead of by example
        values_by_field = [[] for _ in fieldnames]
        for example in examples:
            for column, value in zip(values_by_field, example):
                column.append(value)
        #build them into a LookupList (a.k.a. Example)
        stacked = [self.valuesVStack(name, column)
                   for name, column in zip(fieldnames, values_by_field)]
        return Example(fieldnames, stacked)

    # what in the world is i?
    raise TypeError(i, type(i))
22
b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet,
bengioy@esprit.iro.umontreal.ca
parents:
20
diff
changeset
|
553 |
310
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
554 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
# Enables the call dataset.subset[a:b:c] that will return a DataSet
# around the examples returned by __getitem__(slice(a,b,c)).
#
# The property getter simply delegates to DataSet.__getsubset(self).
subset = property(fget=lambda dataset : dataset.__getsubset(),
                  doc="returns a subset as a DataSet")
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
562 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
563 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
564 def __getsubset(self) : |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
565 """ |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
566 Enables the call data.subset[a:b:c], returns a DataSet. |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
567 Default implementation is a simple wrapper around __getitem__() using MinibatchDataSet. |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
568 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
569 @RETURN DataSet |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
570 @SEE DataSet.subset = property(lambda s : s.__getsubset()) |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
571 """ |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
572 _self = self |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
573 class GetSliceReturnsDataSet(object) : |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
574 def __getitem__(self,slice) : |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
575 return MinibatchDataSet(_self.__getitem__(slice)) |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
576 return GetSliceReturnsDataSet() |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
577 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
578 |
ebccfd05ccd5
dataset.subset implemented
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
309
diff
changeset
|
579 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
580 def valuesHStack(self,fieldnames,fieldvalues): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
581 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
582 Return a value that corresponds to concatenating (horizontally) several field values. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
583 This can be useful to merge some fields. The implementation of this operation is likely |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
584 to involve a copy of the original values. When the values are numpy arrays, the |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
585 result should be numpy.hstack(values). If it makes sense, this operation should |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
586 work as well when each value corresponds to multiple examples in a minibatch |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
587 e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix, |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
588 then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values). |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
589 The default is to use numpy.hstack for numpy.ndarray values, and a list |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
590 pointing to the original values for other data types. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
591 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
592 all_numpy=True |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
593 for value in fieldvalues: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
594 if not type(value) is numpy.ndarray: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
595 all_numpy=False |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
596 if all_numpy: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
597 return numpy.hstack(fieldvalues) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
598 # the default implementation of horizontal stacking is to put values in a list |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
599 return fieldvalues |
26
672fe4b23032
Fixed dataset errors so that _test_dataset.py works again.
bengioy@grenat.iro.umontreal.ca
parents:
23
diff
changeset
|
600 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
601 def valuesVStack(self,fieldname,values): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
602 """ |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
603 @param fieldname: the name of the field from which the values were taken |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
604 @type fieldname: any type |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
605 |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
606 @param values: bits near the beginning or end of the dataset |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
607 @type values: list of minibatches (returned by minibatches_nowrap) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
608 |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
609 @return: the concatenation (stacking) of the values |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
610 @rtype: something suitable as a minibatch field |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
611 """ |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
612 rval = [] |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
613 for v in values: |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
614 rval.extend(v) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
615 return rval |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
616 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
617 def __or__(self,other): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
618 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
619 dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
620 fields of the argument datasets. This only works if they all have the same length. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
621 """ |
135
0d8e721cc63c
Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
134
diff
changeset
|
622 return HStackedDataSet([self,other]) |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
623 |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
624 def __and__(self,other): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
625 """ |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
626 dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
627 (and whose length is the sum of the lengths of the argument datasets). This only |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
628 works if they all have the same fields. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
629 """ |
135
0d8e721cc63c
Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
134
diff
changeset
|
630 return VStackedDataSet([self,other]) |
23
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
631 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
632 def hstack(datasets): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
633 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
634 hstack(dataset1,dataset2,...) returns dataset1 | dataset2 | ... |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
635 which is a dataset whose fields list is the concatenation of the fields |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
636 of the individual datasets. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
637 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
638 assert len(datasets)>0 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
639 if len(datasets)==1: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
640 return datasets[0] |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
641 return HStackedDataSet(datasets) |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
642 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
643 def vstack(datasets): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
644 """ |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
645 vstack(dataset1,dataset2,...) returns dataset1 & dataset2 & ... |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
646 which is a dataset which iterates first over the examples of dataset1, then |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
647 over those of dataset2, etc. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
648 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
649 assert len(datasets)>0 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
650 if len(datasets)==1: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
651 return datasets[0] |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
652 return VStackedDataSet(datasets) |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
653 |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
654 class FieldsSubsetDataSet(DataSet): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
655 """ |
167 | 656 A sub-class of L{DataSet} that selects a subset of the fields. |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
657 """ |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
658 def __init__(self,src,fieldnames): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
659 self.src=src |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
660 self.fieldnames=fieldnames |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
661 assert src.hasFields(*fieldnames) |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
662 self.valuesHStack = src.valuesHStack |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
663 self.valuesVStack = src.valuesVStack |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
664 |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
665 def __len__(self): return len(self.src) |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
666 |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
667 def fieldNames(self): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
668 return self.fieldnames |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
669 |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
670 def __iter__(self): |
44
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
671 class FieldsSubsetIterator(object): |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
672 def __init__(self,ds): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
673 self.ds=ds |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
674 self.src_iter=ds.src.__iter__() |
44
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
675 self.example=None |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
676 def __iter__(self): return self |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
677 def next(self): |
44
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
678 complete_example = self.src_iter.next() |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
679 if self.example: |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
680 self.example._values=[complete_example[field] |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
681 for field in self.ds.fieldnames] |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
682 else: |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
683 self.example=Example(self.ds.fieldnames, |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
684 [complete_example[field] for field in self.ds.fieldnames]) |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
685 return self.example |
5a85fda9b19b
Fixed some more iterator bugs
bengioy@grenat.iro.umontreal.ca
parents:
43
diff
changeset
|
686 return FieldsSubsetIterator(self) |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
687 |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
688 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
689 assert self.hasFields(*fieldnames) |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
690 return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
691 def dontuse__getitem__(self,i): |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
692 return FieldsSubsetDataSet(self.src[i],self.fieldnames) |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
693 |
328
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
694 class RenamedFieldsDataSet(DataSet): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
695 """ |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
696 A sub-class of L{DataSet} that selects and renames a subset of the fields. |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
697 """ |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
698 def __init__(self,src,src_fieldnames,new_fieldnames): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
699 self.src=src |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
700 self.src_fieldnames=src_fieldnames |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
701 self.new_fieldnames=new_fieldnames |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
702 assert src.hasFields(*src_fieldnames) |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
703 assert len(src_fieldnames)==len(new_fieldnames) |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
704 self.valuesHStack = src.valuesHStack |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
705 self.valuesVStack = src.valuesVStack |
351
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
706 self.lookup_fields = Example(new_fieldnames,src_fieldnames) |
328
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
707 |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
708 def __len__(self): return len(self.src) |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
709 |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
710 def fieldNames(self): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
711 return self.new_fieldnames |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
712 |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
713 def __iter__(self): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
714 class FieldsSubsetIterator(object): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
715 def __init__(self,ds): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
716 self.ds=ds |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
717 self.src_iter=ds.src.__iter__() |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
718 self.example=None |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
719 def __iter__(self): return self |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
720 def next(self): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
721 complete_example = self.src_iter.next() |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
722 if self.example: |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
723 self.example._values=[complete_example[field] |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
724 for field in self.ds.src_fieldnames] |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
725 else: |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
726 self.example=Example(self.ds.new_fieldnames, |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
727 [complete_example[field] |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
728 for field in self.ds.src_fieldnames]) |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
729 return self.example |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
730 return FieldsSubsetIterator(self) |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
731 |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
732 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
733 assert self.hasFields(*fieldnames) |
351
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
734 cursor = Example(fieldnames,[0]*len(fieldnames)) |
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
735 for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset): |
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
736 cursor._values=batch._values |
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
737 yield cursor |
7545207466d4
debugged RenamedFieldsDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
344
diff
changeset
|
738 |
328
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
739 def __getitem__(self,i): |
333
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
740 # return FieldsSubsetDataSet(self.src[i],self.new_fieldnames) |
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
741 complete_example = self.src[i] |
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
742 return Example(self.new_fieldnames, |
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
743 [complete_example[field] |
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
744 for field in self.src_fieldnames]) |
7d2f7b8fe213
bugfix in RenamedDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
332
diff
changeset
|
745 |
328
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
746 |
09140ba68e17
Added untested RenamedFieldsDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
322
diff
changeset
|
747 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
748 class DataSetFields(Example): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
749 """ |
167 | 750 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
751 DataSetFields iterates over fields (like columns of a matrix), and can be understood |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
752 as a transpose of the associated dataset. |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
753 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
754 To iterate over fields, one can do |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
755 * for fields in dataset.fields() |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
756 * for fields in dataset(field1,field2,...).fields() to select a subset of fields |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
757 * for fields in dataset.fields(field1,field2,...) to select a subset of fields |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
758 and each of these fields is iterable over the examples: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
759 * for field_examples in dataset.fields(): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
760 for example_value in field_examples: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
761 ... |
241
ddb88a8e9fd2
If I understand properly, the length of an unbounded stream is sys.maxint
delallea@opale.iro.umontreal.ca
parents:
231
diff
changeset
|
762 but when the dataset is a stream (unbounded length), it is not recommended to do |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
763 such things because the underlying dataset may refuse to access the different fields in |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
764 an unsynchronized way. Hence the fields() method is illegal for streams, by default. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
765 The result of fields() is a DataSetFields object, which iterates over fields, |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
766 and whose elements are iterable over examples. A DataSetFields object can |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
767 be turned back into a DataSet with its examples() method: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
768 dataset2 = dataset1.fields().examples() |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
769 and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
770 |
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
771 DataSetFields can be concatenated vertically or horizontally. To be consistent with |
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
772 the syntax used for DataSets, the | concatenates the fields and the & concatenates |
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
773 the examples. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
774 """ |
74
b4159cbdc06b
Fixed errors raised by test_dataset
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
73
diff
changeset
|
775 def __init__(self,dataset,fieldnames): |
65
d48eba49a2f4
fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
64
diff
changeset
|
776 original_dataset=dataset |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
777 if not fieldnames: |
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
778 fieldnames=dataset.fieldNames() |
274
ed70580f2324
bugfix in FieldSubsetDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
273
diff
changeset
|
779 elif not list(fieldnames)==list(dataset.fieldNames()): |
ed70580f2324
bugfix in FieldSubsetDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
273
diff
changeset
|
780 #we must cast to list, otherwise ('x','y')!=['x','y'] |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
781 dataset = FieldsSubsetDataSet(dataset,fieldnames) |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
782 assert dataset.hasFields(*fieldnames) |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
783 self.dataset=dataset |
66
dde1fb1b63ba
fixed test and removed print
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
65
diff
changeset
|
784 |
64
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
785 if isinstance(dataset,MinibatchDataSet): |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
786 Example.__init__(self,fieldnames,list(dataset._fields)) |
65
d48eba49a2f4
fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
64
diff
changeset
|
787 elif isinstance(original_dataset,MinibatchDataSet): |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
788 Example.__init__(self,fieldnames, |
65
d48eba49a2f4
fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
64
diff
changeset
|
789 [original_dataset._fields[field] |
d48eba49a2f4
fixed the infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
64
diff
changeset
|
790 for field in fieldnames]) |
64
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
791 else: |
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
792 minibatch_iterator = dataset.minibatches(fieldnames, |
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
793 minibatch_size=len(dataset), |
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
794 n_batches=1) |
863da25a60f1
trying to fix infinite loop
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
62
diff
changeset
|
795 minibatch=minibatch_iterator.next() |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
796 Example.__init__(self,fieldnames,minibatch) |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
797 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
798 def examples(self): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
799 return self.dataset |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
800 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
801 def __or__(self,other): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
802 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
803 fields1 | fields2 is a DataSetFields whose list of examples is the concatenation |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
804 of the list of examples of DataSetFields fields1 and fields2. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
805 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
806 return (self.examples() + other.examples()).fields() |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
807 |
40
88fd1cce08b9
replaced infinity for length by raise UnboundedDataSet and use & instead of + to concatenate datasets
bengioy@esprit.iro.umontreal.ca
parents:
39
diff
changeset
|
808 def __and__(self,other): |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
809 """ |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
810 fields1 & fields2 is a DataSetFields whose list of fields is the concatenation |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
811 of the fields of DataSetFields fields1 and fields2. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
812 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
813 return (self.examples() | other.examples()).fields() |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
814 |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
815 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
816 class MinibatchDataSet(DataSet): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
817 """ |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
818 Turn an L{Example} of same-length (iterable) fields into an example-iterable dataset. |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
819 Each element of the lookup-list should be an iterable and sliceable, all of the same length. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
820 """ |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
821 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
822 values_hstack=DataSet().valuesHStack): |
17
759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
bergstrj@iro.umontreal.ca
diff
changeset
|
823 """ |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
824 The user can (and generally should) also provide values_vstack(fieldname,fieldvalues) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
825 and values_hstack(fieldnames,fieldvalues) functions behaving with the same |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
826 semantics as the DataSet methods of the same name (but without the self argument). |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
827 """ |
211
bd728c83faff
in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
203
diff
changeset
|
828 |
61
a8b70a9117ad
bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
60
diff
changeset
|
829 self._fields=fields_lookuplist |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
830 assert len(fields_lookuplist)>0 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
831 self.length=len(fields_lookuplist[0]) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
832 for field in fields_lookuplist[1:]: |
223
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
833 if self.length != len(field) : |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
834 print 'self.length = ',self.length |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
835 print 'len(field) = ', len(field) |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
836 print 'self._fields.keys() = ', self._fields.keys() |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
837 print 'field=',field |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
838 print 'fields_lookuplist=', fields_lookuplist |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
839 assert self.length==len(field) |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
840 self.valuesVStack=values_vstack |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
841 self.valuesHStack=values_hstack |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
842 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
843 def __len__(self): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
844 return self.length |
28
541a273bc89f
Removed __array__ method from dataset, whose
bengioy@grenat.iro.umontreal.ca
parents:
26
diff
changeset
|
845 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
846 def dontuse__getitem__(self,i): |
80 | 847 if type(i) in (slice,list): |
48
b6730f9a336d
Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents:
46
diff
changeset
|
848 return DataSetFields(MinibatchDataSet( |
80 | 849 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) |
850 if type(i) is int: | |
85 | 851 return Example(self._fields.keys(),[field[i] for field in self._fields]) |
48
b6730f9a336d
Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents:
46
diff
changeset
|
852 if self.hasFields(i): |
61
a8b70a9117ad
bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
60
diff
changeset
|
853 return self._fields[i] |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
854 assert i in self.__dict__ # else it means we are trying to access a non-existing property |
48
b6730f9a336d
Fixing MinibatchDataSet getitem
bengioy@grenat.iro.umontreal.ca
parents:
46
diff
changeset
|
855 return self.__dict__[i] |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
856 |
29
46c5c90019c2
Changed apply_function so that it propagates methods of the source.
bengioy@grenat.iro.umontreal.ca
parents:
28
diff
changeset
|
857 def fieldNames(self): |
61
a8b70a9117ad
bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
60
diff
changeset
|
858 return self._fields.keys() |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
859 |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
860 def hasFields(self,*fieldnames): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
861 for fieldname in fieldnames: |
61
a8b70a9117ad
bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
60
diff
changeset
|
862 if fieldname not in self._fields.keys(): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
863 return False |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
864 return True |
20
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
865 |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
866 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
223
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
867 #@TODO bug somewhere here, fieldnames doesn't seem to be well handled |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
868 class Iterator(object): |
223
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
869 def __init__(self,ds,fieldnames): |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
870 # tbm: added two next lines to handle fieldnames |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
871 if fieldnames is None: fieldnames = ds._fields.keys() |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
872 self.fieldnames = fieldnames |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
873 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
874 self.ds=ds |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
875 self.next_example=offset |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
876 assert minibatch_size >= 0 |
41 | 877 if offset+minibatch_size > ds.length: |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
878 raise NotImplementedError() |
20
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
879 def __iter__(self): |
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
880 return self |
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
881 def next(self): |
61
a8b70a9117ad
bugfix: in MinibatchDataSet renamed the class variable fields to _fields as parent class have a function called field.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
60
diff
changeset
|
882 upper = self.next_example+minibatch_size |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
883 if upper > len(self.ds) : |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
884 raise StopIteration() |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
885 assert upper<=len(self.ds) # instead of self.ds.length |
223
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
886 #minibatch = Example(self.ds._fields.keys(), |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
887 # [field[self.next_example:upper] |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
888 # for field in self.ds._fields]) |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
889 # tbm: modif to use fieldnames |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
890 values = [] |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
891 for f in self.fieldnames : |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
892 #print 'we have field',f,'in fieldnames' |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
893 values.append( self.ds._fields[f][self.next_example:upper] ) |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
894 minibatch = Example(self.fieldnames,values) |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
895 #print minibatch |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
896 self.next_example+=minibatch_size |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
897 return minibatch |
20
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
898 |
223
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
899 # tbm: added fieldnames to handle subset of fieldnames |
517364d48ae0
should have solved the problem with minibatches not handling subsets of fieldnames, although maybe not super efficient
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
218
diff
changeset
|
900 return Iterator(self,fieldnames) |
20
266c68cb6136
Minor editions, plus adding untested ApplyFunctionDataset for GradientLearner in the works.
bengioy@bengiomac.local
parents:
19
diff
changeset
|
901 |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
902 class HStackedDataSet(DataSet): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
903 """ |
167 | 904 A L{DataSet} that wraps several datasets and shows a view that includes all their fields, |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
905 i.e. whose list of fields is the concatenation of their lists of fields. |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
906 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
907 If a field name is found in more than one of the datasets, then either an error is |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
908 raised or the fields are renamed (either by prefixing the __name__ attribute |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
909 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
910 |
167 | 911 @todo: automatically detect a chain of stacked datasets due to A | B | C | D ... |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
912 """ |
41 | 913 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): |
914 DataSet.__init__(self,description,field_types) | |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
915 self.datasets=datasets |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
916 self.accept_nonunique_names=accept_nonunique_names |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
917 self.fieldname2dataset={} |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
918 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
919 def rename_field(fieldname,dataset,i): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
920 if hasattr(dataset,"__name__"): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
921 return dataset.__name__ + "." + fieldname |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
922 return fieldname+"."+str(i) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
923 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
924 # make sure all datasets have the same length and unique field names |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
925 self.length=None |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
926 names_to_change=[] |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
927 for i in xrange(len(datasets)): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
928 dataset = datasets[i] |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
929 length=len(dataset) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
930 if self.length: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
931 assert self.length==length |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
932 else: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
933 self.length=length |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
934 for fieldname in dataset.fieldNames(): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
935 if fieldname in self.fieldname2dataset: # name conflict! |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
936 if accept_nonunique_names: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
937 fieldname=rename_field(fieldname,dataset,i) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
938 names2change.append((fieldname,i)) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
939 else: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
940 raise ValueError("Incompatible datasets: non-unique field name = "+fieldname) |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
941 self.fieldname2dataset[fieldname]=i |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
942 for fieldname,i in names_to_change: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
943 del self.fieldname2dataset[fieldname] |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
944 self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
945 |
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
946 def hasFields(self,*fieldnames): |
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
947 for fieldname in fieldnames: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
948 if not fieldname in self.fieldname2dataset: |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
949 return False |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
950 return True |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
951 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
952 def fieldNames(self): |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
953 return self.fieldname2dataset.keys() |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
954 |
def minibatches_nowrap(self, fieldnames, minibatch_size, n_batches, offset):
    """
    Return an iterator over minibatches that span the fields of several sub-datasets.

    One minibatch iterator is requested from each sub-dataset that is needed
    to serve fieldnames (from every sub-dataset when fieldnames is empty/None
    or equal to self.fieldNames()). Each call to next() on the returned
    iterator takes one minibatch from each of them and appends them all
    into a single Example.

    minibatch_size, n_batches and offset are forwarded unchanged to the
    minibatches() method of the sub-datasets.
    """

    class HStackedIterator(object):
        """Glues together, field-wise, the minibatches of several iterators."""
        def __init__(self, hsds, iterators):
            self.hsds = hsds
            self.iterators = iterators

        def __iter__(self):
            return self

        def next(self):
            # concatenate all the fields of the minibatches
            l = Example()
            for sub_iterator in self.iterators:
                l.append_lookuplist(sub_iterator.next())
            return l

    # An empty/None fieldnames means "all fields" (see the else branch below),
    # so only check the names when some were actually given.
    assert not fieldnames or self.hasFields(*fieldnames)
    # find out which underlying datasets are necessary to service the required fields
    # and construct corresponding minibatch iterators
    if fieldnames and fieldnames != self.fieldNames():
        # Group the requested fields by the index of the sub-dataset that owns
        # them, remembering the order in which the sub-datasets are first
        # needed so that the result is deterministic.
        needed_indices = []
        fields_in_dataset = {}
        for fieldname in fieldnames:
            dataset_index = self.fieldname2dataset[fieldname]
            if dataset_index not in fields_in_dataset:
                fields_in_dataset[dataset_index] = []
                needed_indices.append(dataset_index)
            fields_in_dataset[dataset_index].append(fieldname)
        iterators = [self.datasets[dataset_index].minibatches(fields_in_dataset[dataset_index],
                                                              minibatch_size, n_batches, offset)
                     for dataset_index in needed_indices]
    else:
        iterators = [dataset.minibatches(None, minibatch_size, n_batches, offset)
                     for dataset in self.datasets]
    return HStackedIterator(self, iterators)
36
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
987 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
988 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
def untested_valuesVStack(self, fieldname, fieldvalues):
    """
    Stack fieldvalues vertically by delegating to the valuesVStack method of
    the sub-dataset that self.fieldname2dataset associates with fieldname.
    """
    owner_index = self.fieldname2dataset[fieldname]
    owner = self.datasets[owner_index]
    return owner.valuesVStack(fieldname, fieldvalues)
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
991 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
def untested_valuesHStack(self, fieldnames, fieldvalues):
    """
    Stack fieldvalues horizontally by delegating to a single sub-dataset.

    The sub-dataset that owns the first name of fieldnames does all the work,
    in the hope that it does not mind being handed names it does not own.
    This heuristic is therefore only guaranteed to work when all the
    fieldnames belong to the same sub-dataset.
    """
    first_owner_index = self.fieldname2dataset[fieldnames[0]]
    first_owner = self.datasets[first_owner_index]
    return first_owner.valuesHStack(fieldnames, fieldvalues)
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
1000 |
438440ba0627
Rewriting dataset.py completely
bengioy@zircon.iro.umontreal.ca
parents:
29
diff
changeset
|
class VStackedDataSet(DataSet):
    """
    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
    in the order provided. This clearly assumes that they all have the same field names
    and all (except possibly the last one) are of finite length.

    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
    """
    def __init__(self,datasets):
        """
        Build the stacked view of datasets (a non-empty sequence of datasets).

        Sets the following attributes:
          - datasets: the sequence received as argument
          - length: the total number of examples
          - datasets_start_row: for each dataset, the global row of its first example
          - index2dataset: map from global row to dataset index, for the rows
            of all the datasets except the last one
        """
        self.datasets=datasets
        self.length=0
        self.index2dataset={}
        assert len(datasets)>0
        fieldnames = datasets[-1].fieldNames()
        self.datasets_start_row=[]
        # We use this map from row index to dataset index for constant-time random access of examples,
        # to avoid having to search for the appropriate dataset each time a slice is asked for.
        # enumerate yields (index, item) pairs, in that order.
        for k,dataset in enumerate(datasets[0:-1]):
            assert not dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length).
            L=len(dataset)
            for i in xrange(L):
                self.index2dataset[self.length+i]=k
            self.datasets_start_row.append(self.length)
            self.length+=L
            assert dataset.fieldNames()==fieldnames
        self.datasets_start_row.append(self.length)
        # NOTE(review): len() is also taken of the last dataset, so an unbounded
        # last dataset presumably fails here -- confirm against DataSet.__len__.
        self.length+=len(datasets[-1])
        # If length is very large, we should use a more memory-efficient mechanism
        # that does not store all indices
        if self.length>1000000:
            # 1 million entries would require about 60 meg for the index2dataset map
            # TODO
            print("A more efficient mechanism for index2dataset should be implemented")

    def __len__(self):
        """Return the total number of examples computed by the constructor."""
        return self.length

    def fieldNames(self):
        """Return the field names of the first dataset (all are asserted equal in the constructor)."""
        return self.datasets[0].fieldNames()

    def hasFields(self,*fieldnames):
        """Return whether the first dataset has all the given fields."""
        return self.datasets[0].hasFields(*fieldnames)

    def locate_row(self,row):
        """
        Return (dataset_index, row_within_dataset) for global row number.

        Raises KeyError for a negative row.
        """
        if row<0:
            raise KeyError(row)
        # index2dataset only covers the datasets before the last one, so any
        # other row belongs to the last dataset.
        dataset_index = self.index2dataset.get(row,len(self.datasets)-1)
        row_within_dataset = row-self.datasets_start_row[dataset_index]
        return dataset_index, row_within_dataset

    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
        """
        Return an iterator over consecutive minibatches of minibatch_size
        examples, starting at global row offset.

        The iteration stops after n_batches minibatches (when n_batches is not
        None) or when fewer than minibatch_size examples remain. A minibatch
        that straddles several datasets is assembled from one piece per
        dataset, the pieces being stacked field by field with valuesVStack.
        """

        class VStackedIterator(object):
            def __init__(self,vsds):
                self.vsds=vsds
                self.next_row=offset
                self.n_batches_done=0

            def __iter__(self):
                return self

            def next(self):
                vsds=self.vsds
                if n_batches is not None and self.n_batches_done>=n_batches:
                    raise StopIteration
                if self.next_row+minibatch_size>len(vsds):
                    # no wrapping around: not enough examples left
                    raise StopIteration
                # Collect one piece per dataset touched by this minibatch.
                # A sub-iterator is created for each piece: simple rather than fast.
                pieces=[]
                first_dataset=None
                row=self.next_row
                n_missing=minibatch_size
                while n_missing>0:
                    dataset_index,row_within_dataset=vsds.locate_row(row)
                    dataset=vsds.datasets[dataset_index]
                    if first_dataset is None:
                        first_dataset=dataset
                    # located rows are inside their dataset, hence n_here>0
                    n_here=min(n_missing,len(dataset)-row_within_dataset)
                    pieces.append(dataset.minibatches(fieldnames,n_here,1,row_within_dataset).next())
                    row+=n_here
                    n_missing-=n_here
                if len(pieces)==1:
                    mb=pieces[0]
                else:
                    # presumably a minibatch requested with no fieldnames holds
                    # every field of the dataset -- TODO confirm
                    names=fieldnames or vsds.fieldNames()
                    mb=Example(names,
                               [first_dataset.valuesVStack(name,
                                                           [piece[name] for piece in pieces])
                                for name in names])
                self.next_row+=minibatch_size
                self.n_batches_done+=1
                return mb

        return VStackedIterator(self)
37
73c4212ba5b3
Factored the minibatch-writing code into an iterator class inside DataSet
bengioy@esprit.iro.umontreal.ca
parents:
36
diff
changeset
|
1108 |
class ArrayFieldsDataSet(DataSet):
    """
    Base class for the datasets whose field values are numpy arrays: it
    provides the horizontal and vertical stacking of such values, so that
    sub-classes do not have to define them.
    """
    def __init__(self, description=None, field_types=None):
        """Hand description and field_types over, unchanged, to DataSet.__init__."""
        DataSet.__init__(self, description, field_types)

    def untested_valuesHStack(self, fieldnames, fieldvalues):
        """Return numpy.hstack(fieldvalues): the field values joined
        horizontally (vectors give a longer vector, matrices a wider matrix).
        fieldnames is not used."""
        joined = numpy.hstack(fieldvalues)
        return joined

    def untested_valuesVStack(self, fieldname, values):
        """Return numpy.vstack(values): the field values joined vertically
        (vectors give the rows of a matrix, matrices a longer matrix).
        fieldname is not used."""
        piled = numpy.vstack(values)
        return piled
1124 | |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1125 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1126 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1127 class NArraysDataSet(ArrayFieldsDataSet) : |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1128 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1129 An NArraysDataSet stores fields that are numpy tensor, whose first axis |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1130 iterates over examples. It's a generalization of ArrayDataSet. |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1131 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1132 #@TODO not completely implemented yet |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1133 def __init__(self, data_arrays, fieldnames, **kwargs) : |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1134 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1135 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1136 of fieldnames. The number of arrays must be the same as the number of |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1137 fieldnames. Each set of numpy tensor must have the same first dimension (first |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1138 axis) corresponding to the number of examples. |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1139 |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1140 Every tensor is treated as a numpy array (using numpy.asarray) |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1141 """ |
318
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1142 ArrayFieldsDataSet.__init__(self,**kwargs) |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1143 assert len(data_arrays) == len(fieldnames) |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1144 assert len(fieldnames) > 0 |
322
ad8be93b3c55
small bugs fixed with NArrayDataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
321
diff
changeset
|
1145 ndarrays = [numpy.asarray(a) for a in data_arrays] |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1146 lens = [a.shape[0] for a in ndarrays] |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1147 num_examples = lens[0] #they must all be equal anyway |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1148 self._fieldnames = fieldnames |
322
ad8be93b3c55
small bugs fixed with NArrayDataSet
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
321
diff
changeset
|
1149 for k in ndarrays : |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1150 assert k.shape[0] == num_examples |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1151 self._datas = ndarrays |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1152 # create dict |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1153 self.map_field_idx = dict() |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1154 for k in range(len(fieldnames)): |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1155 self.map_field_idx[fieldnames[k]] = k |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1156 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1157 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1158 def __len__(self) : |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1159 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1160 Length of the dataset is based on the first array = data_arrays[0], using its shape |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1161 """ |
318
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1162 return self._datas[0].shape[0] |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1163 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1164 def fieldNames(self) : |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1165 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1166 Returns the fieldnames as set in self.__init__ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1167 """ |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1168 return self._fieldnames |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1169 |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1170 def field_pos(self,fieldname) : |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1171 """ |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1172 Returns the index of a given fieldname. The fieldname must exist; see fieldNames(). |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1173 """ |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1174 return self.map_field_idx[fieldname] |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1175 |
318
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1176 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1177 cursor = Example(fieldnames,[0]*len(fieldnames)) |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1178 fieldnames = self.fieldNames() if fieldnames is None else fieldnames |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1179 for n in xrange(n_batches): |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1180 if offset == len(self): |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1181 break |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1182 for f in range(len(cursor._names)) : |
321
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1183 idx = self.field_pos(cursor._names[f]) |
f03ae06fadc8
NArraysDataSet improved, use arrays instead of matrix, also a dictionnary of field indexes
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
320
diff
changeset
|
1184 sub_data = self._datas[idx][offset : offset+minibatch_size] |
318
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1185 cursor._values[f] = sub_data |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1186 offset += len(sub_data) #can be less than minibatch_size at end |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1187 yield cursor |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1188 |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1189 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1190 |
e2eab74b6a28
NArraysDataSet, a generalization ArrayDataSet where every field is a ndarray, is implemented. Not really tested aside basic stuff...
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
317
diff
changeset
|
1191 |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1192 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1193 |
41 | 1194 class ArrayDataSet(ArrayFieldsDataSet): |
1195 """ | |
1196 An ArrayDataSet stores the fields as groups of columns in a numpy tensor, | |
1197 whose first axis iterates over examples, second axis determines fields. | |
1198 If the underlying array is N-dimensional (has N axes), then the field | |
1199 values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). | |
1200 """ | |
1201 | |
188
f01ac276c6fb
added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
171
diff
changeset
|
1202 def __init__(self, data_array, fields_columns, **kwargs): |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1203 """ |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1204 Construct an ArrayDataSet from the underlying numpy array (data) and |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1205 a map (fields_columns) from fieldnames to field columns. The columns of a field are specified |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1206 using the standard arguments for indexing/slicing: integer for a column index, |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1207 slice for an interval of columns (with possible stride), or iterable of column indices. |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1208 """ |
188
f01ac276c6fb
added __contains__ to Dataset, added parent constructor call to ArrayDataSet
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
171
diff
changeset
|
1209 ArrayFieldsDataSet.__init__(self, **kwargs) |
41 | 1210 self.data=data_array |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1211 self.fields_columns=fields_columns |
41 | 1212 |
1213 # check consistency and complete slices definitions | |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1214 for fieldname, fieldcolumns in self.fields_columns.items(): |
41 | 1215 if type(fieldcolumns) is int: |
1216 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1] | |
268
3f1cd8897fda
reverting dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
266
diff
changeset
|
1217 if 1: |
227
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1218 #I changed this because it didn't make sense to me, |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1219 # and it made it more difficult to write my learner. |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1220 # If it breaks stuff, let's talk about it. |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1221 # - James 22/05/2008 |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1222 self.fields_columns[fieldname]=[fieldcolumns] |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1223 else: |
17c5d080964b
reinstating changeset 216 clobbered accidentally by 218
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
223
diff
changeset
|
1224 self.fields_columns[fieldname]=fieldcolumns |
41 | 1225 elif type(fieldcolumns) is slice: |
1226 start,step=None,None | |
1227 if not fieldcolumns.start: | |
1228 start=0 | |
1229 if not fieldcolumns.step: | |
1230 step=1 | |
1231 if start or step: | |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1232 self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step) |
41 | 1233 elif hasattr(fieldcolumns,"__iter__"): # something like a list |
1234 for i in fieldcolumns: | |
1235 assert i>=0 and i<data_array.shape[1] | |
1236 | |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1237 def fieldNames(self): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1238 return self.fields_columns.keys() |
41 | 1239 |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1240 def __len__(self): |
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1241 return len(self.data) |
41 | 1242 |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1243 def __getitem__(self,key): |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1244 """More efficient implementation than the default __getitem__""" |
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1245 fieldnames=self.fields_columns.keys() |
243
c8f19a9eb10f
Optimisation in ArrayDataSet::__getitem__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
242
diff
changeset
|
1246 values=self.fields_columns.values() |
80 | 1247 if type(key) is int: |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1248 return Example(fieldnames, |
243
c8f19a9eb10f
Optimisation in ArrayDataSet::__getitem__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
242
diff
changeset
|
1249 [self.data[key,col] for col in values]) |
80 | 1250 if type(key) is slice: |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1251 return Example(fieldnames,[self.data[key,col] for col in values]) |
80 | 1252 if type(key) is list: |
1253 for i in range(len(key)): | |
1254 if self.hasFields(key[i]): | |
1255 key[i]=self.fields_columns[key[i]] | |
316
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1256 return Example(fieldnames, |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1257 #we must separate differently for list as numpy |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1258 # doesn't support self.data[[i1,...],[i2,...]] |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1259 # when there are more than two i1 and i2 |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1260 [self.data[key,:][:,col] |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1261 if isinstance(col,list) else |
5fe6d0c93109
getitem in ArrayDataSet is set up again, supposed to be faster than default one, has been tested agains the default behaviour. In particular, now always return a LookupList
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
314
diff
changeset
|
1262 self.data[key,col] for col in values]) |
80 | 1263 |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1264 # else check for a fieldname |
80 | 1265 if self.hasFields(key): |
105
8c0a1b11b007
bugfix, we keep all the line, but only a some columns
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
101
diff
changeset
|
1266 return self.data[:,self.fields_columns[key]] |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1267 # else we are trying to access a property of the dataset |
80 | 1268 assert key in self.__dict__ # else it means we are trying to access a non-existing property |
1269 return self.__dict__[key] | |
55
66619ce44497
Efficient implementation of getitem for ArrayDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
48
diff
changeset
|
1270 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1271 def dontuse__iter__(self): |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1272 class ArrayDataSetIteratorIter(object): |
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1273 def __init__(self,dataset,fieldnames): |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1274 if fieldnames is None: fieldnames = dataset.fieldNames() |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1275 # store the resulting minibatch in a lookup-list of values |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1276 self.minibatch = Example(fieldnames,[0]*len(fieldnames)) |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1277 self.dataset=dataset |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1278 self.current=0 |
238
ae1d85aca858
optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
235
diff
changeset
|
1279 self.columns = [self.dataset.fields_columns[f] |
ae1d85aca858
optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
235
diff
changeset
|
1280 for f in self.minibatch._names] |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1281 self.l = self.dataset.data.shape[0] |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1282 def __iter__(self): |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1283 return self |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1284 def next(self): |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1285 #@todo: we suppose that we need to stop only when minibatch_size == 1. |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1286 # Otherwise, MinibatchWrapAroundIterator does it. |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1287 if self.current>=self.l: |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1288 raise StopIteration |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1289 sub_data = self.dataset.data[self.current] |
238
ae1d85aca858
optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
235
diff
changeset
|
1290 self.minibatch._values = [sub_data[c] for c in self.columns] |
ae1d85aca858
optimization in ArrayDataSet::__iter__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
235
diff
changeset
|
1291 |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1292 self.current+=1 |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1293 return self.minibatch |
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1294 |
270
1cafd495098c
code cleanup and small optimisation
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
254
diff
changeset
|
1295 return ArrayDataSetIteratorIter(self,self.fieldNames()) |
228
6f55e301c687
optimisation of ArrayDataSet
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
203
diff
changeset
|
1296 |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1297 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1298 cursor = Example(fieldnames,[0]*len(fieldnames)) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1299 fieldnames = self.fieldNames() if fieldnames is None else fieldnames |
339
aa8aff6abbf7
n_minibatches in ArrayDataSet automatically computed
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
337
diff
changeset
|
1300 if n_batches == None: |
aa8aff6abbf7
n_minibatches in ArrayDataSet automatically computed
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
337
diff
changeset
|
1301 n_batches = (len(self) - offset) / minibatch_size |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1302 for n in xrange(n_batches): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1303 if offset == len(self): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1304 break |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1305 sub_data = self.data[offset : offset+minibatch_size] |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1306 offset += len(sub_data) #can be less than minibatch_size at end |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1307 cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names] |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1308 yield cursor |
42
9b68774fcc6b
Testing basic functionality and removing obvious bugs
bengioy@grenat.iro.umontreal.ca
parents:
41
diff
changeset
|
1309 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1310 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1311 |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1312 |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1313 class CachedDataSet(DataSet): |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1314 """ |
167 | 1315 Wrap a L{DataSet} whose values are computationally expensive to obtain |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1316 (e.g. because they involve some computation, or disk access), |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1317 so that repeated accesses to the same example are done cheaply, |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1318 by caching every example value that has been accessed at least once. |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1319 |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1320 Optionally, for finite-length dataset, all the values can be computed |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1321 (and cached) upon construction of the CachedDataSet, rather than at the |
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1322 first access. |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1323 |
167 | 1324 @todo: when cache_all_upon_construction create mini-batches that are as |
77
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1325 large as possible but not so large as to fill up memory. |
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1326 |
167 | 1327 @todo: add disk-buffering capability, so that when the cache becomes too |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1328 big for memory, we cache things on disk, trying to keep in memory only |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1329 the record most likely to be accessed next. |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1330 """ |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1331 def __init__(self,source_dataset,cache_all_upon_construction=False): |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1332 self.source_dataset=source_dataset |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1333 self.cache_all_upon_construction=cache_all_upon_construction |
152
3f627e844cba
Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
151
diff
changeset
|
1334 self.cached_examples = [] |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1335 if cache_all_upon_construction: |
77
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1336 # this potentially brings all the source examples |
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1337 # into memory at once, which may be too much |
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1338 # the work could possibly be done by minibatches |
1e2bb5bad636
toying with different ways to implement learners
bengioy@bengiomac.local
parents:
74
diff
changeset
|
1339 # that are as large as possible but no more than what memory allows. |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1340 # |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1341 # field_values is supposed to be an DataSetFields, that inherits from LookupList |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1342 #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next() |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1343 fields_values = DataSetFields(source_dataset,None) |
152
3f627e844cba
Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
151
diff
changeset
|
1344 assert all([len(self)==len(field_values) for field_values in fields_values]) |
3f627e844cba
Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
151
diff
changeset
|
1345 for example in fields_values.examples(): |
171
895b4b60f5e8
bugfix. Otherwise the example was writed over and not a new one was returned
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
167
diff
changeset
|
1346 self.cached_examples.append(copy.copy(example)) |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1347 |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1348 self.fieldNames = source_dataset.fieldNames |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1349 self.hasFields = source_dataset.hasFields |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1350 self.valuesHStack = source_dataset.valuesHStack |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1351 self.valuesVStack = source_dataset.valuesVStack |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1352 |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1353 def __len__(self): |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1354 return len(self.source_dataset) |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1355 |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1356 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1357 class CacheIterator(object): |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1358 def __init__(self,dataset): |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1359 self.dataset=dataset |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1360 self.current=offset |
254
8ec867d12428
optimication in CachedDataSet.minibatches_nowrap
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
253
diff
changeset
|
1361 self.all_fields = self.dataset.fieldNames()==fieldnames |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1362 self.n_batches = n_batches |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1363 self.batch_counter = 0 |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1364 def __iter__(self): return self |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1365 def next(self): |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1366 self.batch_counter += 1 |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1367 if self.n_batches and self.batch_counter > self.n_batches : |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1368 raise StopIteration() |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1369 upper = self.current+minibatch_size |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1370 if upper > len(self.dataset.source_dataset): |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1371 raise StopIteration() |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1372 cache_len = len(self.dataset.cached_examples) |
135
0d8e721cc63c
Fixed bugs in dataset to make test_mlp.py work
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
134
diff
changeset
|
1373 if upper>cache_len: # whole minibatch is not already in cache |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1374 # cache everything from current length to upper |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1375 #for example in self.dataset.source_dataset[cache_len:upper]: |
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1376 for example in self.dataset.source_dataset.subset[cache_len:upper]: |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1377 self.dataset.cached_examples.append(example) |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1378 all_fields_minibatch = Example(self.dataset.fieldNames(), |
152
3f627e844cba
Fixes in CacheDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
151
diff
changeset
|
1379 zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size])) |
353
47538a45b878
Cached dataset seems debug, using n_batches... is n_batches around to stay?
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
351
diff
changeset
|
1380 |
163
d7d67651d67c
bugfix, we should advence by the minibatch size.
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
159
diff
changeset
|
1381 self.current+=minibatch_size |
254
8ec867d12428
optimication in CachedDataSet.minibatches_nowrap
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
253
diff
changeset
|
1382 if self.all_fields: |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1383 return all_fields_minibatch |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1384 return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames]) |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1385 return CacheIterator(self) |
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1386 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1387 def dontuse__getitem__(self,i): |
153 | 1388 if type(i)==int and len(self.cached_examples)>i: |
1389 return self.cached_examples[i] | |
1390 else: | |
251
7e6edee187e3
optimization of CachedDataSet__getitem__
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
243
diff
changeset
|
1391 return self.source_dataset[i] |
252
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1392 |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1393 def __iter__(self): |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1394 class CacheIteratorIter(object): |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1395 def __init__(self,dataset): |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1396 self.dataset=dataset |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1397 self.l = len(dataset) |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1398 self.current = 0 |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1399 self.fieldnames = self.dataset.fieldNames() |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1400 self.example = Example(self.fieldnames,[0]*len(self.fieldnames)) |
252
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1401 def __iter__(self): return self |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1402 def next(self): |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1403 if self.current>=self.l: |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1404 raise StopIteration |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1405 cache_len = len(self.dataset.cached_examples) |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1406 if self.current>=cache_len: # whole minibatch is not already in cache |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1407 # cache everything from current length to upper |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1408 self.dataset.cached_examples.append( |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1409 self.dataset.source_dataset[self.current]) |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1410 self.example._values = self.dataset.cached_examples[self.current] |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1411 self.current+=1 |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1412 return self.example |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1413 |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1414 return CacheIteratorIter(self) |
856d14dc4468
implemented CachedDataSet.__iter__ as an optimization
Frederic Bastien <bastienf@iro.umontreal.ca>
parents:
251
diff
changeset
|
1415 |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1416 class ApplyFunctionDataSet(DataSet): |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1417 """ |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1418 A L{DataSet} that contains as fields the results of applying a |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1419 given function example-wise or minibatch-wise to all the fields of |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1420 an input dataset. The output of the function should be an iterable |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1421 (e.g. a list or a LookupList) over the resulting values. |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1422 |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1423 The function take as input the fields of the dataset, not the examples. |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1424 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1425 In minibatch mode, the function is expected to work on minibatches |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1426 (takes a minibatch in input and returns a minibatch in output). More |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1427 precisely, it means that each element of the input or output list |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1428 should be iterable and indexable over the individual example values |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1429 (typically these elements will be numpy arrays). All of the elements |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1430 in the input and output lists should have the same length, which is |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1431 the length of the minibatch. |
57
1aabd2e2bb5f
Added empty classes with doc: CachedDataSet and ApplyFunctionDataSet
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
56
diff
changeset
|
1432 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1433 The function is applied each time an example or a minibatch is accessed. |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1434 To avoid re-doing computation, wrap this dataset inside a CachedDataSet. |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1435 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1436 If the values_{h,v}stack functions are not provided, then |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1437 the input_dataset.values{H,V}Stack functions are used by default. |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1438 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1439 """ |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1440 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1441 def __init__(self,input_dataset,function,output_names,minibatch_mode=True, |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1442 values_hstack=None,values_vstack=None, |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1443 description=None,fieldtypes=None): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1444 """ |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1445 Constructor takes an input dataset that has as many fields as the function |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1446 expects as inputs. The resulting dataset has as many fields as the function |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1447 produces as outputs, and that should correspond to the number of output names |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1448 (provided in a list). |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1449 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1450 Note that the expected semantics of the function differs in minibatch mode |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1451 (it takes minibatches of inputs and produces minibatches of outputs, as |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1452 documented in the class comment). |
211
bd728c83faff
in __get__, problem if the i.stop was None, i being the slice, added one line replacing None by the len(self)
Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
parents:
203
diff
changeset
|
1453 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1454 TBM: are filedtypes the old field types (from input_dataset) or the new ones |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1455 (for the new dataset created)? |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1456 """ |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1457 self.input_dataset=input_dataset |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1458 self.function=function |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1459 self.output_names=output_names |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1460 self.minibatch_mode=minibatch_mode |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1461 DataSet.__init__(self,description,fieldtypes) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1462 self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1463 self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1464 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1465 def __len__(self): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1466 return len(self.input_dataset) |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1467 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1468 def fieldNames(self): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1469 return self.output_names |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1470 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1471 def minibatches_nowrap(self, fieldnames, *args, **kwargs): |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1472 all_input_fieldNames = self.input_dataset.fieldNames() |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1473 mbnw = self.input_dataset.minibatches_nowrap |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1474 |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1475 for input_fields in mbnw(all_input_fieldNames, *args, **kwargs): |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1476 if self.minibatch_mode: |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1477 all_output_fields = self.function(*input_fields) |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1478 else: |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1479 input_examples = zip(*input_fields) #makes so that [i] means example i |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1480 output_examples = [self.function(*input_example) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1481 for input_example in input_examples] |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1482 all_output_fields = zip(*output_examples) |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1483 |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1484 all_outputs = Example(self.output_names, all_output_fields) |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1485 #print 'input_fields', input_fields |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1486 #print 'all_outputs', all_outputs |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1487 if fieldnames==self.output_names: |
293 | 1488 rval = all_outputs |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1489 else: |
293 | 1490 rval = Example(fieldnames,[all_outputs[name] for name in fieldnames]) |
296
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1491 #print 'rval', rval |
f5d33f9c0b9c
ApplyFunctionDataSet passing
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
293
diff
changeset
|
1492 #print '--------' |
293 | 1493 yield rval |
73
69f97aad3faf
Coded untested ApplyFunctionDataSet and CacheDataSet
bengioy@bengiomac.local
parents:
72
diff
changeset
|
1494 |
290
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1495 def untested__iter__(self): # only implemented for increased efficiency |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1496 class ApplyFunctionSingleExampleIterator(object): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1497 def __init__(self,output_dataset): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1498 self.current=0 |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1499 self.output_dataset=output_dataset |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1500 self.input_iterator=output_dataset.input_dataset.__iter__() |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1501 def __iter__(self): return self |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1502 def next(self): |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1503 if self.output_dataset.minibatch_mode: |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1504 function_inputs = [[input] for input in self.input_iterator.next()] |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1505 outputs = self.output_dataset.function(*function_inputs) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1506 assert all([hasattr(output,'__iter__') for output in outputs]) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1507 function_outputs = [output[0] for output in outputs] |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1508 else: |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1509 function_inputs = self.input_iterator.next() |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1510 function_outputs = self.output_dataset.function(*function_inputs) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1511 return Example(self.output_dataset.output_names,function_outputs) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1512 return ApplyFunctionSingleExampleIterator(self) |
9b533cc7874a
trying to get default implemenations to work
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
274
diff
changeset
|
1513 |
23
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
    """
    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
    by forcing the user to define a set of fields as the 'input' field
    and a set of fields as the 'target' field. Optionally, a single
    weight_field can also be defined.

    @param src_dataset: the dataset to wrap; only its C{merge_fields}
        method is used here.
    @param input_fields: the fields to group under the name 'input'.
    @param target_fields: the fields to group under the name 'target'.
    @param weight_field: optional single field to expose under the name
        'weight'; omitted from the call when None.
    @return: whatever C{src_dataset.merge_fields} returns when called with
        one (fields, new_name) pair per group.
    """
    # One (fields, new_name) pair per merged field.
    args = ((input_fields,'input'),(target_fields,'target'))
    if weight_field is not None:
        # The trailing comma matters: it makes a 1-tuple holding the pair, so
        # += appends a single (fields, name) pair instead of splicing the
        # pair's two members into args as separate arguments.
        args += (([weight_field],'weight'),)
    return src_dataset.merge_fields(*args)
23
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
1524 |
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
1525 |
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
1526 |
526e192b0699
Working on ApplyFunctionDataSet, added constraint that
bengioy@esprit.iro.umontreal.ca
parents:
22
diff
changeset
|
1527 |