annotate dataset.py @ 16:813723310d75

commenting
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 18:23:44 -0400
parents 88168361a5ab be128b9127c8
children 759d17112b23
rev   line source
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
1
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
class Example(object):
    """
    An example is something that is like a tuple but whose elements can be named, so that
    the following syntactic constructions work as one would expect:
       example.x = [1, 2, 3] # set a field
       x, y, z = example
       x = example[0]
       x = example["x"]

    The instance keeps a reference to the 'values' sequence passed to the constructor
    (it is not copied) and a 'fields' dictionary mapping each field name to the
    position of its value in 'values'.
    """
    def __init__(self,names,values):
        """
        names: sequence of field names, one per element of values.
        values: indexable sequence holding the value of each field.
        """
        assert len(values)==len(names)
        # __setattr__ is overridden below and only accepts existing field names,
        # so the two internal attributes are stored through __dict__ directly.
        self.__dict__['values']=values
        self.__dict__['fields']=dict((name,i) for i,name in enumerate(names))

    def __getitem__(self,i):
        """
        Return the value at position i if i is an int, otherwise the value of
        the field named i.

        An out-of-range position raises the IndexError of the underlying
        sequence, which is what terminates tuple-unpacking ("x, y, z = example").
        """
        if isinstance(i,int):
            return self.values[i]
        return self.values[self.fields[i]]

    def __setitem__(self,i,value):
        """Set the value at position i (int) or of the field named i."""
        if isinstance(i,int):
            self.values[i]=value
        else:
            self.values[self.fields[i]]=value

    def __getattr__(self,name):
        """
        Return the value of the field called name.

        Only invoked when normal attribute lookup fails. Raises AttributeError
        (not KeyError) for unknown names so that hasattr(), getattr() with a
        default, copy and pickle behave as they do for ordinary objects.
        """
        # Read the internal state from __dict__ rather than through self.fields:
        # if 'fields' is not set yet (e.g. instance created by __new__), going
        # through attribute access would re-enter __getattr__ forever.
        state = self.__dict__
        try:
            position = state['fields'][name]
        except KeyError:
            raise AttributeError(name)
        return state['values'][position]

    def __setattr__(self,name,value):
        """
        Set the value of the existing field called name.

        Raises KeyError if name is not one of the fields.
        """
        self.values[self.fields[name]]=value

    def __len__(self):
        """Return the number of values (i.e. of fields) in this example."""
        return len(self.values)
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
38
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
39
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
class DataSet(object):
    """A virtual base class for datasets.

    A DataSet is a generator of iterators; these iterators can run through the
    examples in a variety of ways.  A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning.

    To iterate over examples, there are several possibilities:
    - for i in dataset.zip(field1, field2,field3, ...)
    - for i in dataset.minibatches(N, field1, field2, ...)
    - for i in dataset
    Each of these is documented below.

    Note: For a dataset of fixed and known length, which can implement item
    random-access efficiently (e.g. indexing and slicing), and which can profit
    from the FiniteDataSetIterator, consider using base class FiniteDataSet.

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type.

    """

    def __init__(self):
        pass

    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self.  Every field of "i" will give access to
        the field of a single example.  Fields should be accessible via
        i[identifier], but the derived class is free to accept any type of
        identifier, and add extra functionality to the iterator.
        """
        raise NotImplementedError

    def zip(self, *fieldnames):
        """
        Return an iterator which sees only the specified fields (each fieldname
        is a field key, typically a string).

        Supports two forms of syntax:

            for i in dataset.zip(f1, f2, f3): ...

            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is the f1
        field of the current example, i[1] is the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will contain the contents of the
        f1, f2, and f3 fields of a single example on each loop iteration.

        The derived class may accept fieldname arguments of any type.

        """
        raise NotImplementedError

    def minibatches(self,minibatch_size,*fieldnames):
        """
        Similar to zip but iterate over minibatches of examples.

        Supports two forms of syntax:

            for i in dataset.minibatches(N, f1, f2, f3): ...

            for i1, i2, i3 in dataset.minibatches(N, f1, f2, f3): ...

        Return a minibatch iterator, whose next() method returns an 'example'
        whose fields are iterable objects (which can iterate over the individual
        values of that field in the minibatch); typically these will be arrays
        with minibatch_size rows.  A minibatch is thus also an example, but one
        whose fields can be iterated on again in order to obtain the individual
        examples.

        Using the first syntax, "i" will be an indexable object such that on
        every iteration, i[0] holds the f1 field of the examples of the current
        minibatch, i[1] holds the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will hold the f1, f2, and f3 fields
        of the examples of the current minibatch on each loop iteration.

        The derived class may accept fieldname arguments of any type.
        """
        raise NotImplementedError

    def fieldNames(self):
        """Return the list of field names in the examples of this dataset."""
        raise NotImplementedError

    # 'self' was missing from the original signature, which made the instance
    # itself show up as the first element of new_field_specifications.
    def rename(self,*new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).
        """
        raise NotImplementedError
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
155
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and self.length()-1,
    and a subdataset can be obtained by slicing. This may not be appropriate in general
    but only for datasets which can be thought of like ones that access rows AND fields
    in an efficient random access way. Users are encouraged to expect only the generic dataset
    interface in general. A FiniteDataSet is mainly useful when one has to obtain
    a subset of examples (e.g. for splitting a dataset into training and test sets).
    """

    def __init__(self):
        DataSet.__init__(self)

    def __iter__(self):
        """Return a FiniteDataSetIterator over this dataset, built with its default arguments."""
        return FiniteDataSetIterator(self)

    def zip(self,*fieldnames):
        """Return a FiniteDataSetIterator over the given fields, with a minibatch size of 1."""
        return FiniteDataSetIterator(self,1,fieldnames)

    def minibatches(self,minibatch_size,*fieldnames):
        """Return a FiniteDataSetIterator over the given fields, with the given minibatch size."""
        return FiniteDataSetIterator(self,minibatch_size,fieldnames)

    def __getattr__(self,fieldname):
        """Return an object that can iterate over the values of the field in this dataset.

        This is the result of self(fieldname), i.e. of __call__.  It is only
        invoked when normal attribute lookup fails.
        """
        # Special names (__deepcopy__, __getstate__, ...) are probed by copy,
        # pickle and hasattr(); they are not fields, and must fail with
        # AttributeError rather than be forwarded to __call__.
        if fieldname.startswith('__') and fieldname.endswith('__'):
            raise AttributeError(fieldname)
        return self(fieldname)

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields.

        The return value's default iterator will iterate only over the given
        fields.
        """
        raise NotImplementedError

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise NotImplementedError

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise NotImplementedError

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise NotImplementedError
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
202
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
class FiniteDataSetIterator(object):
    """
    Iterator over the examples (or minibatches of examples) of a finite dataset.

    The position counter lives in the iterator rather than in the dataset, so
    several iterators can walk the same dataset independently.

    If the fieldnames list is empty, it means that we want to see ALL the fields.
    """

    def __init__(self, dataset, minibatch_size=1, fieldnames=None):
        """
        dataset        -- object supporting len(), dataset[i] and dataset[i:j].
        minibatch_size -- number of examples returned by each call to next();
                          must satisfy 1 <= minibatch_size <= len(dataset).
        fieldnames     -- names given to the values of each returned example;
                          None or an empty sequence means "all fields", in which
                          case the example is returned exactly as the dataset
                          produced it.
        """
        self.dataset = dataset
        self.minibatch_size = minibatch_size
        assert minibatch_size >= 1 and minibatch_size <= len(dataset), \
            "minibatch_size must be between 1 and len(dataset)"
        # Start one step *before* the first example: next() advances first.
        self.current = -self.minibatch_size
        # A fresh list per iterator instead of a mutable default argument that
        # would be shared by every iterator created without fieldnames.
        if fieldnames is None:
            fieldnames = []
        self.fieldnames = fieldnames

    def __iter__(self):
        """An iterator is its own iterable."""
        return self

    def next(self):
        """
        Return the next example (minibatch_size == 1) or the next slice of
        minibatch_size examples.

        The last minibatch may be shorter than minibatch_size when the dataset
        length is not a multiple of it.  When the end is reached the position is
        reset (so the same iterator can be looped over again) and StopIteration
        is raised.
        """
        self.current += self.minibatch_size
        if self.current >= len(self.dataset):
            # Rewind before signalling exhaustion so the iterator is reusable.
            self.current = -self.minibatch_size
            raise StopIteration
        if self.minibatch_size == 1:
            complete_example = self.dataset[self.current]
        else:
            complete_example = self.dataset[self.current:self.current + self.minibatch_size]
        if self.fieldnames:
            # NOTE(review): every value of the complete example is paired with
            # self.fieldnames; presumably a selection of the named fields was
            # intended -- confirm against Example and the callers.
            return Example(self.fieldnames, list(complete_example))
        return complete_example

    # Python 3 spelling of the iterator protocol; next() stays for Python 2.
    __next__ = next
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
230
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
231
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
232 # we may want ArrayDataSet defined in another python file
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
233
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
234 import numpy
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
235
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
class ArrayDataSet(FiniteDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view multiple field values as an 'Example').
    It is a fixed-length and fixed-width dataset
    in which each element is a numpy array or a number, hence the whole
    dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' (dataset[i]) is an Example holding one value per field,
    otherwise it is a numpy array.
    Any dataset can also be converted to a numpy array (losing the notion of fields)
    by the numpy.array(dataset) call.
    """

    def __init__(self, dataset=None, data=None, fields=None):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional description
        of the fields (dictionary of column slices indexed by field names).

        Construction from an existing dataset is not implemented yet and always
        ends with NotImplementedError.  If neither dataset nor data is given,
        no attribute is set on the instance.

        Field slices are normalized so that start, stop and step are all
        defined (missing start -> 0, missing stop -> number of columns,
        missing step -> 1) and must lie within the columns of data.
        """
        # Work on a private copy: the normalization below rewrites entries and
        # must not touch the caller's dictionary (nor a shared default).
        if fields is None:
            fields = {}
        else:
            fields = dict(fields)
        # Identity tests: "!= None" on a numpy array is an elementwise comparison.
        if dataset is not None:
            assert data is None and not fields
            # Make ONE big minibatch with all the examples, to separate the fields.
            n_examples = len(dataset)
            batch = dataset.minibatches(n_examples).next()
            # Each field of the underlying dataset must be convertible to a numpy array of the same type
            # currently just double, but should use the smallest compatible dtype
            n_fields = len(batch)
            fieldnames = list(batch.fields.keys())
            total_width = 0
            for i in range(n_fields):
                field = numpy.array(batch[i])
                assert field.shape[0] == n_examples
                width = field.shape[1]
                start = total_width
                total_width += width
                fields[fieldnames[i]] = slice(start, total_width, 1)
            # many complicated things remain to be done:
            #  - find common dtype
            #  - decide what to do with extra dimensions if not the same in all fields
            #  - try to see if we can avoid the copy?
            raise NotImplementedError
        if data is not None:
            assert dataset is None
            self.data = data
            self.fields = fields
            self.width = data.shape[1]
            for fieldname, fieldslice in list(fields.items()):
                # make sure fieldslice.start, fieldslice.stop and fieldslice.step are defined
                start = fieldslice.start
                stop = fieldslice.stop
                step = fieldslice.step
                if not start:
                    start = 0
                if stop is None:
                    stop = self.width
                if not step:
                    step = 1
                fields[fieldname] = fieldslice = slice(start, stop, step)
                # and coherent with the data array
                assert fieldslice.start >= 0 and fieldslice.stop <= self.width

    def __getattr__(self, fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
        than the dataset itself) is returned.

        Raises AttributeError when fieldname is not a field of this dataset.
        """
        # __getattr__ is only called when normal lookup failed.  Fetch 'fields'
        # and 'data' without going through __getattr__ again, otherwise a
        # dataset on which they are not set would recurse forever.
        try:
            fields = object.__getattribute__(self, 'fields')
            data = object.__getattribute__(self, 'data')
            column_slice = fields[fieldname]
        except (AttributeError, KeyError):
            # AttributeError (not KeyError) is what hasattr() and
            # getattr(obj, name, default) expect from a failed lookup.
            raise AttributeError("%s object has no attribute or field %r"
                                 % (type(self).__name__, fieldname))
        if len(data) == 1:
            return data[0, column_slice]
        return data[:, column_slice]

    def __call__(self, *fieldnames):
        """
        Return a sub-dataset containing only the given fieldnames as fields.

        Called without any name, all the fields of this dataset are kept.
        The sub-dataset shares the smallest contiguous range of columns that
        covers the selected fields, with the field slices shifted accordingly.
        Raises KeyError if a name is not a field of this dataset.
        """
        if not fieldnames:
            fieldnames = list(self.fields)
        selected = [(name, self.fields[name]) for name in fieldnames]
        # Smallest column range [min_col, max_col) covering the selected fields.
        min_col = self.data.shape[1]
        max_col = 0
        for name, field_slice in selected:
            min_col = min(min_col, field_slice.start)
            max_col = max(max_col, field_slice.stop)
        new_fields = {}
        for name, field_slice in selected:
            new_fields[name] = slice(field_slice.start - min_col,
                                     field_slice.stop - min_col,
                                     field_slice.step)
        return ArrayDataSet(data=self.data[:, min_col:max_col], fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        return list(self.fields.keys())

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self, i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
        the result is just a numpy array (for the i-th row of the dataset data matrix).
        """
        if self.fields:
            fieldnames, fieldslices = zip(*self.fields.items())
            return Example(fieldnames, [self.data[i, fieldslice] for fieldslice in fieldslices])
        return self.data[i]

    def __getslice__(self, *slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        return ArrayDataSet(data=self.data[slice(*slice_args)], fields=self.fields)

    def __array__(self, dtype=None, copy=None):
        """Return a view of this dataset which is a numpy.ndarray

        Numpy uses this special function name to retrieve an ndarray view for
        function such as numpy.sum, numpy.dot, numpy.asarray, etc.

        If this dataset has no fields, then we simply return self.data.
        Otherwise only the columns covered by at least one field are returned,
        in increasing column order: as a view of self.data when these columns
        form a single (possibly strided) slice, as a contiguous copy otherwise.
         - why do we want this behaviour when there are fields? (JB)

        dtype and copy are the optional arguments numpy may pass: a different
        dtype forces a converted copy, copy=True forces a copy and copy=False
        raises ValueError when a copy cannot be avoided.
        """
        is_copy = False
        if not self.fields:
            result = self.data
        else:
            # select subsets of columns mapped by the fields
            columns_used = numpy.zeros(self.data.shape[1], dtype=bool)
            for field_slice in self.fields.values():
                columns_used[field_slice] = True
            used = numpy.flatnonzero(columns_used)
            # try to figure out if we can map all the slices into one slice:
            # this is the case when consecutive used columns are evenly spaced.
            gaps = numpy.diff(used)
            if len(used) == 0:
                # no column is covered by any field: empty (n_examples, 0) view
                result = self.data[:, 0:0]
            elif len(gaps) == 0 or (gaps == gaps[0]).all():
                step = 1
                if len(gaps):
                    step = int(gaps[0])
                result = self.data[:, int(used[0]):int(used[-1]) + 1:step]
            else:
                # else make contiguous copy
                result = numpy.ascontiguousarray(self.data[:, used])
                is_copy = True
        if dtype is not None and numpy.dtype(dtype) != result.dtype:
            if copy is False:
                raise ValueError("a copy is needed to convert the dataset to the requested dtype")
            return result.astype(dtype)
        if copy is False and is_copy:
            raise ValueError("the columns of the fields cannot be viewed without a copy")
        if copy and not is_copy:
            return result.copy()
        return result
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
394
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
395