Mercurial > pylearn
comparison dataset.py @ 77:1e2bb5bad636
toying with different ways to implement learners
author | bengioy@bengiomac.local |
---|---|
date | Sun, 04 May 2008 15:09:22 -0400 |
parents | b4159cbdc06b |
children | 3499918faa9d |
comparison
equal
deleted
inserted
replaced
76:ccd6ae89a7c4 | 77:1e2bb5bad636 |
---|---|
125 - hasFields | 125 - hasFields |
126 - __getitem__ may not be feasible with some streams | 126 - __getitem__ may not be feasible with some streams |
127 - __iter__ | 127 - __iter__ |
128 """ | 128 """ |
129 | 129 |
130 numpy_vstack = lambda fieldname,values: return numpy.vstack(values) | |
131 numpy_hstack = lambda fieldnames,values: return numpy.hstack(values) | |
132 | |
130 def __init__(self,description=None,fieldtypes=None): | 133 def __init__(self,description=None,fieldtypes=None): |
131 if description is None: | 134 if description is None: |
132 # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)" | 135 # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)" |
133 description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" | 136 description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" |
134 self.description=description | 137 self.description=description |
968 | 971 |
969 Optionally, for a finite-length dataset, all the values can be computed | 972 Optionally, for a finite-length dataset, all the values can be computed |
970 (and cached) upon construction of the CachedDataSet, rather than at the | 973 (and cached) upon construction of the CachedDataSet, rather than at the |
971 first access. | 974 first access. |
972 | 975 |
973 TODO: add disk-buffering capability, so that when the cache becomes too | 976 @todo when cache_all_upon_construction create mini-batches that are as |
977 large as possible but not so large as to fill up memory. | |
978 | |
979 @todo add disk-buffering capability, so that when the cache becomes too | |
974 big for memory, we cache things on disk, trying to keep in memory only | 980 big for memory, we cache things on disk, trying to keep in memory only |
975 the record most likely to be accessed next. | 981 the record most likely to be accessed next. |
976 """ | 982 """ |
977 def __init__(self,source_dataset,cache_all_upon_construction=False): | 983 def __init__(self,source_dataset,cache_all_upon_construction=False): |
978 self.source_dataset=source_dataset | 984 self.source_dataset=source_dataset |
979 self.cache_all_upon_construction=cache_all_upon_construction | 985 self.cache_all_upon_construction=cache_all_upon_construction |
980 if cache_all_upon_construction: | 986 if cache_all_upon_construction: |
987 # this potentially brings all the source examples | |
988 # into memory at once, which may be too much; | |
989 # the work could possibly be done by minibatches | |
990 # that are as large as possible but no larger than what memory allows. | |
981 self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()) | 991 self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()) |
982 else: | 992 else: |
983 self.cached_examples = [] | 993 self.cached_examples = [] |
984 | 994 |
985 self.fieldNames = source_dataset.fieldNames | 995 self.fieldNames = source_dataset.fieldNames |