comparison dataset.py @ 77:1e2bb5bad636

toying with different ways to implement learners
author bengioy@bengiomac.local
date Sun, 04 May 2008 15:09:22 -0400
parents b4159cbdc06b
children 3499918faa9d
--- dataset.py (76:ccd6ae89a7c4)
+++ dataset.py (77:1e2bb5bad636)
@@ -125,10 +125,13 @@
     - hasFields
     - __getitem__ may not be feasible with some streams
     - __iter__
     """
 
+    numpy_vstack = lambda fieldname, values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames, values: numpy.hstack(values)
+
     def __init__(self, description=None, fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
             description = type(self).__name__ + " ( " + ",".join([x.__name__ for x in type(self).__bases__]) + " )"
         self.description = description
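The two class-level lambdas added above (fixed here to drop the invalid
`return`, which is a syntax error inside a lambda) read like default
callbacks for assembling minibatch arrays: given the values one field takes
across a set of examples, stack them vertically (or several fields
horizontally). A minimal sketch of that reading, with invented example
arrays; only the helper shapes come from the changeset:

    import numpy

    # Same shape as the helpers added in this changeset: ignore the
    # field name(s), stack the per-example values into one array.
    numpy_vstack = lambda fieldname, values: numpy.vstack(values)
    numpy_hstack = lambda fieldnames, values: numpy.hstack(values)

    # Three examples, each contributing a 1x2 'input' row.
    rows = [numpy.array([[1., 2.]]),
            numpy.array([[3., 4.]]),
            numpy.array([[5., 6.]])]
    batch = numpy_vstack('input', rows)   # -> shape (3, 2), one row per example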
@@ -968,18 +971,25 @@
 
     Optionally, for a finite-length dataset, all the values can be computed
     (and cached) upon construction of the CachedDataSet, rather than at the
     first access.
 
-    TODO: add disk-buffering capability, so that when the cache becomes too
+    @todo When cache_all_upon_construction is set, build the cache from
+    mini-batches that are as large as possible but not so large as to fill
+    up memory.
+
+    @todo Add disk-buffering capability, so that when the cache becomes too
     big for memory, we cache things on disk, trying to keep in memory only
     the records most likely to be accessed next.
     """
     def __init__(self, source_dataset, cache_all_upon_construction=False):
         self.source_dataset = source_dataset
         self.cache_all_upon_construction = cache_all_upon_construction
         if cache_all_upon_construction:
+            # This potentially brings all the source examples into memory at
+            # once, which may be too much; the work could instead be done in
+            # minibatches that are as large as memory allows.
             self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next())
         else:
             self.cached_examples = []
 
         self.fieldNames = source_dataset.fieldNames
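A note on the cache-all branch above: a single dataset-sized minibatch is
field-major (one sequence of values per field), and zip(*minibatch)
transposes it into the example-major tuples that cached_examples stores.
An illustrative sketch with invented field values:

    # One minibatch over the whole (3-example) dataset, field-major:
    batch = ([0, 1, 2],         # all 'input' values
             ['a', 'b', 'c'])   # all 'target' values

    # Transpose to example-major: [(0, 'a'), (1, 'b'), (2, 'c')]
    cached_examples = zip(*batch)   # Python 2: a list of tuples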
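The first @todo could be prototyped by filling the cache chunk by chunk
instead of with one dataset-sized minibatch. The sketch below assumes
minibatches() can be iterated over the whole dataset at a given
minibatch_size (as the constructor's call suggests); max_bytes and
bytes_per_example are illustrative parameters, not part of the API:

    def cache_in_chunks(source_dataset, bytes_per_example,
                        max_bytes=100*1024*1024):
        # Largest chunk that fits the (estimated) memory budget.
        chunk = max(1, min(len(source_dataset),
                           max_bytes // bytes_per_example))
        cached = []
        for minibatch in source_dataset.minibatches(minibatch_size=chunk):
            cached.extend(zip(*minibatch))   # example-major, as above
        return cached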