Mercurial > pylearn
diff dataset.py @ 77:1e2bb5bad636
toying with different ways to implement learners
author:   bengioy@bengiomac.local
date:     Sun, 04 May 2008 15:09:22 -0400
parents:  b4159cbdc06b
children: 3499918faa9d
line wrap: on
line diff
--- a/dataset.py	Sat May 03 22:00:37 2008 -0400
+++ b/dataset.py	Sun May 04 15:09:22 2008 -0400
@@ -127,6 +127,9 @@
     - __iter__
     """
 
+    numpy_vstack = lambda fieldname,values: return numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: return numpy.hstack(values)
+
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
@@ -970,7 +973,10 @@
     (and cached) upon construction of the CachedDataSet, rather at the
     first access.
 
-    TODO: add disk-buffering capability, so that when the cache becomes too
+    @todo when cache_all_upon_construction create mini-batches that are as
+    large as possible but not so large as to fill up memory.
+
+    @todo add disk-buffering capability, so that when the cache becomes too
     big for memory, we cache things on disk, trying to keep in memory only
     the record most likely to be accessed next.
     """
@@ -978,6 +984,10 @@
         self.source_dataset=source_dataset
         self.cache_all_upon_construction=cache_all_upon_construction
         if cache_all_upon_construction:
+            # this potentially brings all the source examples
+            # into memory at once, which may be too much
+            # the work could possibly be done by minibatches
+            # that are as large as possible but no more than what memory allows.
             self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next())
         else:
             self.cached_examples = []