annotate dataset.py @ 11:be128b9127c8

Debugged (to the extent of my tests) the new version of dataset
author bengioy@esprit.iro.umontreal.ca
date Wed, 26 Mar 2008 15:01:30 -0400
parents de616c423dbd
children ff4e551490f1 813723310d75
rev   line source
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
1
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
2 class Example(object):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
3 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
4 An example is something that is like a tuple but whose elements can be named, to that
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
5 following syntactic constructions work as one would expect:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
6 example.x = [1, 2, 3] # set a field
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
7 x, y, z = example
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
8 x = example[0]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
9 x = example["x"]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
10 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
11 def __init__(self,names,values):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
12 assert len(values)==len(names)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
13 self.__dict__['values']=values
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
14 self.__dict__['fields']={}
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
15 for i in xrange(len(values)):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
16 self.fields[names[i]]=i
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
17
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
18 def __getitem__(self,i):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
19 if isinstance(i,int):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
20 return self.values[i]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
21 else:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
22 return self.values[self.fields[i]]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
23
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
24 def __setitem__(self,i,value):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
25 if isinstance(i,int):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
26 self.values[i]=value
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
27 else:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
28 self.values[self.fields[i]]=value
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
29
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
30 def __getattr__(self,name):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
31 return self.values[self.fields[name]]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
32
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
33 def __setattr__(self,name,value):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
34 self.values[self.fields[name]]=value
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
35
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
36 def __len__(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
37 return len(self.values)
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
38
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
39
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
40 class DataSet(object):
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
41 """
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
42 This is a virtual base class or interface for datasets.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
43 A dataset is basically an iterator over Examples (or anything that
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
44 behaves like an Example). It does not necessarily
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
45 have a fixed length (this is useful for 'streams' which feed on-line learning).
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
46 Datasets with fixed and known length are instances of FiniteDataSet, a subclass
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
47 which supports indexing (dataset[i]) and slicing (dataset[1000:2000]).
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
48 To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
49 method which returns an iterator over only the desired fields.
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
50 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
51 The content of a field can be of any type, but often will be a numpy array.
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
52 If one iterates through minibatches of examples (with the minibatches() method
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
53 or with the minibatch_size argument of the zip() method), then the fields
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
54 returned by the iterator's next method should be iterators over the
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
55 individual values within the minibatch (typically these will be arrays
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
56 with minibatch_size rows).
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
57 """
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
58
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
59 def __init__(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
60 pass
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
61
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
62 def __iter__(self):
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
63 """
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
64 Return an iterator, whose next() method returns the next example or the next
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
65 minibatch in the dataset. A minibatch (of length > 1) is also an example, but
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
66 whose fields should be something one can iterate on again in order to obtain
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
67 the individual examples.
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
68 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
69 raise NotImplementedError
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
70
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
71 def zip(self,*fieldnames):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
72 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
73 Return an iterator which sees only the specified fields (each fieldname is a
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
74 field key, typically a string). The value returned at each iteration
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
75 is a tuple with one element per field. Hence it can be used like this:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
76 for f1, f2, f3 in dataset.zip('field1','field2','field3'):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
77 ... use f1, f2, and f3
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
78 """
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
79 raise NotImplementedError
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
80
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
81 def minibatches(self,minibatch_size,*fieldnames):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
82 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
83 Similar to zip but iterate over minibatches.
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
84 Return a minibatch iterator, whose next() method returns an 'example'
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
85 whose fields are iteratable objects (which can iterate over the individual
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
86 values of that field in the minibatch).
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
87 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
88 raise NotImplementedError
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
89
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
90 def fieldNames(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
91 """Return the list of field names in the examples of this dataset."""
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
92 raise NotImplementedError
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
93
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
94 def rename(*new_field_specifications):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
95 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
96 Return a new dataset that maps old fields (of self) to new fields (of the returned
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
97 dataset). The minimal syntax that should be supported is the following:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
98 new_field_specifications = [new_field_spec1, new_field_spec2, ...]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
99 new_field_spec = ([old_field1, old_field2, ...], new_field)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
100 In general both old_field and new_field should be strings, but some datasets may also
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
101 support additional indexing schemes within each field (e.g. column slice
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
102 of a matrix-like field).
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
103 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
104 raise NotImplementedError
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
105
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
106 class FiniteDataSet(DataSet):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
107 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
108 Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
109 Examples are indexed by an integer between 0 and self.length()-1,
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
110 and a subdataset can be obtained by slicing. This may not be appropriate in general
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
111 but only for datasets which can be thought of like ones that access rows AND fields
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
112 in an efficient random access way. Users are encouraged to expect only the generic dataset
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
113 interface in general. A FiniteDataSet is mainly useful when one has to obtain
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
114 a subset of examples (e.g. for splitting a dataset into training and test sets).
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
115 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
116
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
117 def __init__(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
118 pass
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
119
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
120 def __iter__(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
121 return FiniteDataSetIterator(self)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
122
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
123 def zip(self,*fieldnames):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
124 return FiniteDataSetIterator(self,1,fieldnames)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
125
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
126 def minibatches(self,minibatch_size,*fieldnames):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
127 return FiniteDataSetIterator(self,minibatch_size,fieldnames)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
128
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
129 def __getattr__(self,fieldname):
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
130 """Return an that can iterate over the values of the field in this dataset."""
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
131 return self(fieldname)
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
132
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
133 def __call__(self,*fieldnames):
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
134 """Return a sub-dataset containing only the given fieldnames as fields."""
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
135 raise NotImplementedError
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
136
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
137 def __len__(self):
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
138 """len(dataset) returns the number of examples in the dataset."""
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
139 raise NotImplementedError
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
140
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
141 def __getitem__(self,i):
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
142 """dataset[i] returns the (i+1)-th example of the dataset."""
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
143 raise NotImplementedError
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
144
2
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
145 def __getslice__(self,*slice_args):
3fddb1c8f955 Rewrote DataSet interface and created FiniteDataSet interface.
bengioy@bengiomac.local
parents: 1
diff changeset
146 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
147 raise NotImplementedError
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
148
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
149 class FiniteDataSetIterator(object):
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
150 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
151 If the fieldnames list is empty, it means that we want to see ALL the fields.
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
152 """
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
153 def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
154 self.dataset=dataset
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
155 self.minibatch_size=minibatch_size
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
156 assert minibatch_size>=1 and minibatch_size<=len(dataset)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
157 self.current = -self.minibatch_size
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
158 self.fieldnames = fieldnames
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
159
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
160 def __iter__(self):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
161 return self
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
162
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
163 def next(self):
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
164 self.current+=self.minibatch_size
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
165 if self.current>=len(self.dataset):
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
166 self.current=-self.minibatch_size
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
167 raise StopIteration
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
168 if self.minibatch_size==1:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
169 complete_example=self.dataset[self.current]
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
170 else:
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
171 complete_example=self.dataset[self.current:self.current+self.minibatch_size]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
172 if self.fieldnames:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
173 return Example(self.fieldnames,list(complete_example))
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
174 else:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
175 return complete_example
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
176
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
177
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
178 # we may want ArrayDataSet defined in another python file
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
179
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
180 import numpy
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
181
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
182 class ArrayDataSet(FiniteDataSet):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
183 """
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
184 An ArrayDataSet behaves like a numpy array but adds the notion of named fields
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
185 from DataSet (and the ability to view multiple field values as an 'Example').
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
186 It is a fixed-length and fixed-width dataset
9
de616c423dbd Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents: 8
diff changeset
187 in which each element is a numpy array or a number, hence the whole
de616c423dbd Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents: 8
diff changeset
188 dataset corresponds to a numpy array. Fields
de616c423dbd Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents: 8
diff changeset
189 must correspond to a slice of array columns. If the dataset has fields,
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
190 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
9
de616c423dbd Improving comments in dataset.py
bengioy@esprit.iro.umontreal.ca
parents: 8
diff changeset
191 Any dataset can also be converted to a numpy array (losing the notion of fields
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
192 by the numpy.array(dataset) call.
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
193 """
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
194
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
195 def __init__(self,dataset=None,data=None,fields={}):
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
196 """
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
197 There are two ways to construct an ArrayDataSet: (1) from an
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
198 existing dataset (which may result in a copy of the data in a numpy array),
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
199 or (2) from a numpy.array (the data argument), along with an optional description
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
200 of the fields (dictionary of column slices indexed by field names).
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
201 """
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
202 if dataset!=None:
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
203 assert data==None and fields=={}
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
204 # Make ONE big minibatch with all the examples, to separate the fields.
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
205 n_examples=len(dataset)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
206 batch = dataset.minibatches(n_examples).next()
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
207 # Each field of the underlying dataset must be convertible to a numpy array of the same type
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
208 # currently just double, but should use the smallest compatible dtype
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
209 n_fields = len(batch)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
210 fieldnames = batch.fields.keys()
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
211 total_width = 0
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
212 type = None
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
213 for i in xrange(n_fields):
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
214 field = array(batch[i])
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
215 assert field.shape[0]==n_examples
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
216 width = field.shape[1]
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
217 start=total_width
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
218 total_width += width
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
219 fields[fieldnames[i]]=slice(start,total_width,1)
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
220 # many complicated things remain to be done:
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
221 # - find common dtype
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
222 # - decide what to do with extra dimensions if not the same in all fields
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
223 # - try to see if we can avoid the copy?
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
224 raise NotImplementedError
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
225 if data!=None:
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
226 assert dataset==None
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
227 self.data=data
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
228 self.fields=fields
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
229 self.width = data.shape[1]
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
230 for fieldname in fields:
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
231 fieldslice=fields[fieldname]
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
232 # make sure fieldslice.start and fieldslice.step are defined
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
233 start=fieldslice.start
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
234 step=fieldslice.step
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
235 if not start:
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
236 start=0
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
237 if not step:
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
238 step=1
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
239 if not fieldslice.start or not fieldslice.step:
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
240 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
241 # and coherent with the data array
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
242 assert fieldslice.start>=0 and fieldslice.stop<=self.width
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
243
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
244 def __getattr__(self,fieldname):
4
f7dcfb5f9d5b Added test for dataset.
bengioy@bengiomac.local
parents: 3
diff changeset
245 """
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
246 Return a numpy array with the content associated with the given field name.
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
247 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
248 than the dataset itself) is returned.
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
249 """
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
250 if len(self.data)==1:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
251 return self.data[0,self.fields[fieldname]]
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
252 return self.data[:,self.fields[fieldname]]
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
253
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
254 def __call__(self,*fieldnames):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
255 """Return a sub-dataset containing only the given fieldnames as fields."""
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
256 min_col=self.data.shape[1]
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
257 max_col=0
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
258 for field_slice in self.fields.values():
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
259 min_col=min(min_col,field_slice.start)
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
260 max_col=max(max_col,field_slice.stop)
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
261 new_fields={}
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
262 for field in self.fields:
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
263 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
264 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
265
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
266 def fieldNames(self):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
267 """Return the list of field names that are supported by getattr and getFields."""
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
268 return self.fields.keys()
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
269
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
270 def __len__(self):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
271 """len(dataset) returns the number of examples in the dataset."""
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
272 return len(self.data)
1
2cd82666b9a7 Added statscollector and started writing dataset and learner.
bengioy@esprit.iro.umontreal.ca
parents: 0
diff changeset
273
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
274 def __getitem__(self,i):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
275 """
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
276 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
277 the result is just a numpy array (for the i-th row of the dataset data matrix).
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
278 """
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
279 if self.fields:
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
280 fieldnames,fieldslices=zip(*self.fields.items())
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
281 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
282 else:
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
283 return self.data[i]
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
284
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
285 def __getslice__(self,*slice_args):
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
286 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
6
d5738b79089a Removed MinibatchIterator and instead made minibatch_size a field of all DataSets,
bengioy@bengiomac.local
parents: 5
diff changeset
287 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
3
378b68d5c4ad Added first (untested) version of ArrayDataSet
bengioy@bengiomac.local
parents: 2
diff changeset
288
8
d1c394486037 Replaced asarray() method by __array__ method which gets called automatically when
bengioy@bengiomac.local
parents: 7
diff changeset
289 def __array__(self):
7
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
290 if not self.fields:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
291 return self.data
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
292 # else, select subsets of columns mapped by the fields
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
293 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
294 for field_slice in self.fields.values():
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
295 for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
296 columns_used[c]=True
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
297 # try to figure out if we can map all the slices into one slice:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
298 mappable_to_one_slice = True
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
299 start=0
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
300 while start<len(columns_used) and not columns_used[start]:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
301 start+=1
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
302 stop=len(columns_used)
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
303 while stop>0 and not columns_used[stop-1]:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
304 stop-=1
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
305 step=0
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
306 i=start
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
307 while i<stop:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
308 j=i+1
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
309 while j<stop and not columns_used[j]:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
310 j+=1
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
311 if step:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
312 if step!=j-i:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
313 mappable_to_one_slice = False
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
314 break
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
315 else:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
316 step = j-i
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
317 i=j
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
318 if mappable_to_one_slice:
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
319 return self.data[:,slice(start,stop,step)]
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
320 # else make contiguous copy
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
321 n_columns = sum(columns_used)
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
322 result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
323 print result.shape
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
324 c=0
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
325 for field_slice in self.fields.values():
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
326 slice_width=field_slice.stop-field_slice.start/field_slice.step
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
327 # copy the field here
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
328 result[:,slice(c,slice_width)]=self.data[:,field_slice]
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
329 c+=slice_width
6f8f338686db Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
bengioy@bengiomac.local
parents: 6
diff changeset
330 return result
11
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
331
be128b9127c8 Debugged (to the extent of my tests) the new version of dataset
bengioy@esprit.iro.umontreal.ca
parents: 9
diff changeset
332