Mercurial > pylearn
annotate dataset.py @ 16:813723310d75
commenting
author | bergstrj@iro.umontreal.ca |
---|---|
date | Wed, 26 Mar 2008 18:23:44 -0400 |
parents | 88168361a5ab be128b9127c8 |
children | 759d17112b23 |
rev | line source |
---|---|
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
1 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
2 class Example(object): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
3 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
4 An example is something that is like a tuple but whose elements can be named, to that |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
5 following syntactic constructions work as one would expect: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
6 example.x = [1, 2, 3] # set a field |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
7 x, y, z = example |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
8 x = example[0] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
9 x = example["x"] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
10 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
11 def __init__(self,names,values): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
12 assert len(values)==len(names) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
13 self.__dict__['values']=values |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
14 self.__dict__['fields']={} |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
15 for i in xrange(len(values)): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
16 self.fields[names[i]]=i |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
17 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
18 def __getitem__(self,i): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
19 if isinstance(i,int): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
20 return self.values[i] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
21 else: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
22 return self.values[self.fields[i]] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
23 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
24 def __setitem__(self,i,value): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
25 if isinstance(i,int): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
26 self.values[i]=value |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
27 else: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
28 self.values[self.fields[i]]=value |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
29 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
30 def __getattr__(self,name): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
31 return self.values[self.fields[name]] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
32 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
33 def __setattr__(self,name,value): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
34 self.values[self.fields[name]]=value |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
35 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
36 def __len__(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
37 return len(self.values) |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
38 |
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
39 |
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
40 class DataSet(object): |
16 | 41 """A virtual base class for datasets. |
42 | |
43 A DataSet is a generator of iterators; these iterators can run through the | |
44 examples in a variety of ways. A DataSet need not necessarily have a finite | |
45 or known length, so this class can be used to interface to a 'stream' which | |
46 feed on-line learning. | |
47 | |
48 To iterate over examples, there are several possibilities: | |
49 - for i in dataset.zip(field1, field2,field3, ...) | |
50 - for i in dataset.minibatches(N, field1, field2, ...) | |
51 - for i in dataset | |
52 Each of these is documented below. | |
53 | |
54 Note: For a dataset of fixed and known length, which can implement item | |
55 random-access efficiently (e.g. indexing and slicing), and which can profit | |
56 from the FiniteDataSetIterator, consider using base class FiniteDataSet. | |
57 | |
58 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. | |
59 | |
60 Note: The content of a field can be of any type. | |
61 | |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
62 """ |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
63 |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
64 def __init__(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
65 pass |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
66 |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
67 def __iter__(self): |
16 | 68 """Supports the syntax "for i in dataset: ..." |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
69 |
16 | 70 Using this syntax, "i" will be an Example instance (or equivalent) with |
71 all the fields of DataSet self. Every field of "i" will give access to | |
72 a the field of a single example. Fields should be accessible via | |
73 i[identifier], but the derived class is free to accept any type of | |
74 identifier, and add extra functionality to the iterator. | |
6
d5738b79089a
Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents:
5
diff
changeset
|
75 """ |
16 | 76 raise NotImplementedError |
77 | |
78 def zip(self, *fieldnames): | |
6
d5738b79089a
Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents:
5
diff
changeset
|
79 """ |
16 | 80 Supports two forms of syntax: |
81 | |
82 for i in dataset.zip(f1, f2, f3): ... | |
83 | |
84 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | |
85 | |
86 Using the first syntax, "i" will be an indexable object, such as a list, | |
87 tuple, or Example instance, such that on every iteration, i[0] is the f1 | |
88 field of the current example, i[1] is the f2 field, and so on. | |
89 | |
90 Using the second syntax, i1, i2, i3 will contain the the contents of the | |
91 f1, f2, and f3 fields of a single example on each loop iteration. | |
92 | |
93 The derived class may accept fieldname arguments of any type. | |
94 | |
6
d5738b79089a
Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents:
5
diff
changeset
|
95 """ |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
96 raise NotImplementedError |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
97 |
16 | 98 def minibatches(self,minibatch_size,*fieldnames): |
99 """ | |
100 Supports two forms of syntax: | |
101 | |
102 for i in dataset.zip(f1, f2, f3): ... | |
103 | |
104 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | |
105 | |
106 Using the first syntax, "i" will be an indexable object, such as a list, | |
107 tuple, or Example instance, such that on every iteration, i[0] is the f1 | |
108 field of the current example, i[1] is the f2 field, and so on. | |
109 | |
110 Using the second syntax, i1, i2, i3 will contain the the contents of the | |
111 f1, f2, and f3 fields of a single example on each loop iteration. | |
112 | |
113 The derived class may accept fieldname arguments of any type. | |
114 | |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
115 Return an iterator, whose next() method returns the next example or the next |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
116 minibatch in the dataset. A minibatch (of length > 1) is also an example, but |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
117 whose fields should be something one can iterate on again in order to obtain |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
118 the individual examples. |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
119 |
16 | 120 DataSet.zip returns an iterator over only the desired fields, and each field |
121 of the iterator contains one example. | |
122 | |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
123 Return an iterator which sees only the specified fields (each fieldname is a |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
124 field key, typically a string). The value returned at each iteration |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
125 is a tuple with one element per field. Hence it can be used like this: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
126 for f1, f2, f3 in dataset.zip('field1','field2','field3'): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
127 ... use f1, f2, and f3 |
16 | 128 If one iterates through minibatches of examples (with the minibatches() method |
129 or with the minibatch_size argument of the zip() method), then the fields | |
130 returned by the iterator's next method should be iterators over the | |
131 individual values within the minibatch (typically these will be arrays | |
132 with minibatch_size rows). | |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
133 Similar to zip but iterate over minibatches. |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
134 Return a minibatch iterator, whose next() method returns an 'example' |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
135 whose fields are iteratable objects (which can iterate over the individual |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
136 values of that field in the minibatch). |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
137 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
138 raise NotImplementedError |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
139 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
140 def fieldNames(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
141 """Return the list of field names in the examples of this dataset.""" |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
142 raise NotImplementedError |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
143 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
144 def rename(*new_field_specifications): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
145 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
146 Return a new dataset that maps old fields (of self) to new fields (of the returned |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
147 dataset). The minimal syntax that should be supported is the following: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
148 new_field_specifications = [new_field_spec1, new_field_spec2, ...] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
149 new_field_spec = ([old_field1, old_field2, ...], new_field) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
150 In general both old_field and new_field should be strings, but some datasets may also |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
151 support additional indexing schemes within each field (e.g. column slice |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
152 of a matrix-like field). |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
153 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
154 raise NotImplementedError |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
155 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
156 class FiniteDataSet(DataSet): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
157 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
158 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
159 Examples are indexed by an integer between 0 and self.length()-1, |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
160 and a subdataset can be obtained by slicing. This may not be appropriate in general |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
161 but only for datasets which can be thought of like ones that access rows AND fields |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
162 in an efficient random access way. Users are encouraged to expect only the generic dataset |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
163 interface in general. A FiniteDataSet is mainly useful when one has to obtain |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
164 a subset of examples (e.g. for splitting a dataset into training and test sets). |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
165 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
166 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
167 def __init__(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
168 pass |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
169 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
170 def __iter__(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
171 return FiniteDataSetIterator(self) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
172 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
173 def zip(self,*fieldnames): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
174 return FiniteDataSetIterator(self,1,fieldnames) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
175 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
176 def minibatches(self,minibatch_size,*fieldnames): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
177 return FiniteDataSetIterator(self,minibatch_size,fieldnames) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
178 |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
179 def __getattr__(self,fieldname): |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
180 """Return an that can iterate over the values of the field in this dataset.""" |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
181 return self(fieldname) |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
182 |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
183 def __call__(self,*fieldnames): |
16 | 184 """Return a sub-dataset containing only the given fieldnames as fields. |
185 | |
186 The return value's default iterator will iterate only over the given | |
187 fields. | |
188 """ | |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
189 raise NotImplementedError |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
190 |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
191 def __len__(self): |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
192 """len(dataset) returns the number of examples in the dataset.""" |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
193 raise NotImplementedError |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
194 |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
195 def __getitem__(self,i): |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
196 """dataset[i] returns the (i+1)-th example of the dataset.""" |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
197 raise NotImplementedError |
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
198 |
2
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
199 def __getslice__(self,*slice_args): |
3fddb1c8f955
Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents:
1
diff
changeset
|
200 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
201 raise NotImplementedError |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
202 |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
203 class FiniteDataSetIterator(object): |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
204 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
205 If the fieldnames list is empty, it means that we want to see ALL the fields. |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
206 """ |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
207 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
208 self.dataset=dataset |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
209 self.minibatch_size=minibatch_size |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
210 assert minibatch_size>=1 and minibatch_size<=len(dataset) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
211 self.current = -self.minibatch_size |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
212 self.fieldnames = fieldnames |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
213 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
214 def __iter__(self): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
215 return self |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
216 |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
217 def next(self): |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
218 self.current+=self.minibatch_size |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
219 if self.current>=len(self.dataset): |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
220 self.current=-self.minibatch_size |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
221 raise StopIteration |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
222 if self.minibatch_size==1: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
223 complete_example=self.dataset[self.current] |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
224 else: |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
225 complete_example=self.dataset[self.current:self.current+self.minibatch_size] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
226 if self.fieldnames: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
227 return Example(self.fieldnames,list(complete_example)) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
228 else: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
229 return complete_example |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
230 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
231 |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
232 # we may want ArrayDataSet defined in another python file |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
233 |
4 | 234 import numpy |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
235 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
236 class ArrayDataSet(FiniteDataSet): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
237 """ |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
238 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
239 from DataSet (and the ability to view multiple field values as an 'Example'). |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
240 It is a fixed-length and fixed-width dataset |
9
de616c423dbd
Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents:
8
diff
changeset
|
241 in which each element is a numpy array or a number, hence the whole |
de616c423dbd
Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents:
8
diff
changeset
|
242 dataset corresponds to a numpy array. Fields |
de616c423dbd
Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents:
8
diff
changeset
|
243 must correspond to a slice of array columns. If the dataset has fields, |
6
d5738b79089a
Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents:
5
diff
changeset
|
244 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
9
de616c423dbd
Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents:
8
diff
changeset
|
245 Any dataset can also be converted to a numpy array (losing the notion of fields |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
246 by the numpy.array(dataset) call. |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
247 """ |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
248 |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
249 def __init__(self,dataset=None,data=None,fields={}): |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
250 """ |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
251 There are two ways to construct an ArrayDataSet: (1) from an |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
252 existing dataset (which may result in a copy of the data in a numpy array), |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
253 or (2) from a numpy.array (the data argument), along with an optional description |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
254 of the fields (dictionary of column slices indexed by field names). |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
255 """ |
4 | 256 if dataset!=None: |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
257 assert data==None and fields=={} |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
258 # Make ONE big minibatch with all the examples, to separate the fields. |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
259 n_examples=len(dataset) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
260 batch = dataset.minibatches(n_examples).next() |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
261 # Each field of the underlying dataset must be convertible to a numpy array of the same type |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
262 # currently just double, but should use the smallest compatible dtype |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
263 n_fields = len(batch) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
264 fieldnames = batch.fields.keys() |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
265 total_width = 0 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
266 type = None |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
267 for i in xrange(n_fields): |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
268 field = array(batch[i]) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
269 assert field.shape[0]==n_examples |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
270 width = field.shape[1] |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
271 start=total_width |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
272 total_width += width |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
273 fields[fieldnames[i]]=slice(start,total_width,1) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
274 # many complicated things remain to be done: |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
275 # - find common dtype |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
276 # - decide what to do with extra dimensions if not the same in all fields |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
277 # - try to see if we can avoid the copy? |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
278 raise NotImplementedError |
4 | 279 if data!=None: |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
280 assert dataset==None |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
281 self.data=data |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
282 self.fields=fields |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
283 self.width = data.shape[1] |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
284 for fieldname in fields: |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
285 fieldslice=fields[fieldname] |
4 | 286 # make sure fieldslice.start and fieldslice.step are defined |
287 start=fieldslice.start | |
288 step=fieldslice.step | |
289 if not start: | |
290 start=0 | |
291 if not step: | |
292 step=1 | |
293 if not fieldslice.start or not fieldslice.step: | |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
294 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
4 | 295 # and coherent with the data array |
296 assert fieldslice.start>=0 and fieldslice.stop<=self.width | |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
297 |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
298 def __getattr__(self,fieldname): |
4 | 299 """ |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
300 Return a numpy array with the content associated with the given field name. |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
301 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
302 than the dataset itself) is returned. |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
303 """ |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
304 if len(self.data)==1: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
305 return self.data[0,self.fields[fieldname]] |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
306 return self.data[:,self.fields[fieldname]] |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
307 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
308 def __call__(self,*fieldnames): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
309 """Return a sub-dataset containing only the given fieldnames as fields.""" |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
310 min_col=self.data.shape[1] |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
311 max_col=0 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
312 for field_slice in self.fields.values(): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
313 min_col=min(min_col,field_slice.start) |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
314 max_col=max(max_col,field_slice.stop) |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
315 new_fields={} |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
316 for field in self.fields: |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
317 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
318 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
319 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
320 def fieldNames(self): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
321 """Return the list of field names that are supported by getattr and getFields.""" |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
322 return self.fields.keys() |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
323 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
324 def __len__(self): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
325 """len(dataset) returns the number of examples in the dataset.""" |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
326 return len(self.data) |
1
2cd82666b9a7
Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents:
0
diff
changeset
|
327 |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
328 def __getitem__(self,i): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
329 """ |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
330 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
331 the result is just a numpy array (for the i-th row of the dataset data matrix). |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
332 """ |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
333 if self.fields: |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
334 fieldnames,fieldslices=zip(*self.fields.items()) |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
335 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
336 else: |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
337 return self.data[i] |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
338 |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
339 def __getslice__(self,*slice_args): |
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
340 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
6
d5738b79089a
Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents:
5
diff
changeset
|
341 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) |
3
378b68d5c4ad
Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents:
2
diff
changeset
|
342 |
8
d1c394486037
Replaced asarray() method by __array__ method which gets called automatically when
bengioy@bengiomac.local
parents:
7
diff
changeset
|
343 def __array__(self): |
15 | 344 """Return an view of this dataset which is an numpy.ndarray |
345 | |
346 Numpy uses this special function name to retrieve an ndarray view for | |
347 function such as numpy.sum, numpy.dot, numpy.asarray, etc. | |
348 | |
349 If this dataset has no fields, then we simply return self.data, | |
350 otherwise things are complicated. | |
351 - why do we want this behaviour when there are fields? (JB) | |
352 """ | |
7
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
353 if not self.fields: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
354 return self.data |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
355 # else, select subsets of columns mapped by the fields |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
356 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
357 for field_slice in self.fields.values(): |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
358 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
359 columns_used[c]=True |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
360 # try to figure out if we can map all the slices into one slice: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
361 mappable_to_one_slice = True |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
362 start=0 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
363 while start<len(columns_used) and not columns_used[start]: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
364 start+=1 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
365 stop=len(columns_used) |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
366 while stop>0 and not columns_used[stop-1]: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
367 stop-=1 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
368 step=0 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
369 i=start |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
370 while i<stop: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
371 j=i+1 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
372 while j<stop and not columns_used[j]: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
373 j+=1 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
374 if step: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
375 if step!=j-i: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
376 mappable_to_one_slice = False |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
377 break |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
378 else: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
379 step = j-i |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
380 i=j |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
381 if mappable_to_one_slice: |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
382 return self.data[:,slice(start,stop,step)] |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
383 # else make contiguous copy |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
384 n_columns = sum(columns_used) |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
385 result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
386 print result.shape |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
387 c=0 |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
388 for field_slice in self.fields.values(): |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
389 slice_width=field_slice.stop-field_slice.start/field_slice.step |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
390 # copy the field here |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
391 result[:,slice(c,slice_width)]=self.data[:,field_slice] |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
392 c+=slice_width |
6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents:
6
diff
changeset
|
393 return result |
11
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
394 |
be128b9127c8
Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents:
9
diff
changeset
|
395 |