# HG changeset patch # User Frederic Bastien # Date 1209494444 14400 # Node ID e3ac93e27e162b857fa289fa6d6cbaf6f693e9e8 # Parent 59757365a057e9bfb6acefa3e3a695f1ad7a86be# Parent 718befdc867137209e968542e70490c4dcb2f5a7 Automated merge with ssh://p-omega1@lgcm.iro.umontreal.ca/tlearn diff -r 59757365a057 -r e3ac93e27e16 dataset.py --- a/dataset.py Tue Apr 29 14:40:33 2008 -0400 +++ b/dataset.py Tue Apr 29 14:40:44 2008 -0400 @@ -8,7 +8,6 @@ class AbstractFunction (Exception): """Derived class must override this function""" class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" -#class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)""" class DataSet(object): """A virtual base class for datasets. @@ -19,7 +18,7 @@ python object, which depends on the particular dataset. We call a DataSet a 'stream' when its length is unbounded (otherwise its __len__ method - should raise an UnboundedDataSet exception). + should return sys.maxint). A DataSet is a generator of iterators; these iterators can run through the examples or the fields in a variety of ways. A DataSet need not necessarily have a finite @@ -304,11 +303,17 @@ def __len__(self): """ len(dataset) returns the number of examples in the dataset. - By default, a DataSet is a 'stream', i.e. it has an unbounded length (raises UnboundedDataSet). + By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). Sub-classes which implement finite-length datasets should redefine this method. Some methods only make sense for finite-length datasets. """ - raise UnboundedDataSet() + return sys.maxint + + def is_unbounded(self): + """ + Tests whether a dataset is unbounded (e.g. a stream). + """ + return len(self)==sys.maxint def hasFields(self,*fieldnames): """ @@ -380,7 +385,8 @@ elif type(i) is list: rows = i if rows is not None: - fields_values = zip(*[self[row] for row in rows]) + examples = [self[row] for row in rows] + fields_values = zip(*examples) return MinibatchDataSet( Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values) for fieldname,field_values @@ -592,15 +598,19 @@ return self.length def __getitem__(self,i): - return DataSetFields(MinibatchDataSet( - Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields) + if type(i) in (int,slice,list): + return DataSetFields(MinibatchDataSet( + Example(self.fields.keys(),[field[i] for field in self.fields])),self.fields) + if self.hasFields(i): + return self.fields[i] + return self.__dict__[i] def fieldNames(self): return self.fields.keys() def hasFields(self,*fieldnames): for fieldname in fieldnames: - if fieldname not in self.fields: + if fieldname not in self.fields.keys(): return False return True @@ -749,11 +759,8 @@ # We use this map from row index to dataset index for constant-time random access of examples, # to avoid having to search for the appropriate dataset each time and slice is asked for. for dataset,k in enumerate(datasets[0:-1]): - try: - L=len(dataset) - except UnboundedDataSet: - print "All VStacked datasets (except possibly the last) must be bounded (have a length)." - assert False + assert dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length). + L=len(dataset) for i in xrange(L): self.index2dataset[self.length+i]=k self.datasets_start_row.append(self.length) diff -r 59757365a057 -r e3ac93e27e16 misc.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc.py Tue Apr 29 14:40:44 2008 -0400 @@ -0,0 +1,8 @@ + +def unique_elements_list_intersection(list1,list2): + """ + Return the unique elements that are in both list1 and list2 + (repeated elements in listi will not be duplicated in the result). + This should run in O(n1+n2) where n1=|list1|, n2=|list2|. + """ + return list(set.intersection(set(list1),set(list2))) diff -r 59757365a057 -r e3ac93e27e16 test_dataset.py --- a/test_dataset.py Tue Apr 29 14:40:33 2008 -0400 +++ b/test_dataset.py Tue Apr 29 14:40:44 2008 -0400 @@ -19,6 +19,6 @@ print "minibatch=",minibatch for var in minibatch: print "var=",var - print "take a slice:",ds[1:6:2] + print "take a slice and look at field y",ds[1:6:2]["y"] test1()