pylearn: dataset.py comparison

comparison dataset.py @ 77:1e2bb5bad636

toying with different ways to implement learners

author	bengioy@bengiomac.local
date	Sun, 04 May 2008 15:09:22 -0400
parents	b4159cbdc06b
children	3499918faa9d

comparison

equal deleted inserted replaced

-:ccd6ae89a7c4
+:1e2bb5bad636
 - hasFields
 - __getitem__ may not be feasible with some streams
 - __iter__
 """
+numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
 def __init__(self,description=None,fieldtypes=None):
 if description is None:
 # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
 description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
 self.description=description
 Optionally, for finite-length datasets, all the values can be computed
 (and cached) upon construction of the CachedDataSet, rather than at the
 first access.
-TODO: add disk-buffering capability, so that when the cache becomes too
+@todo when cache_all_upon_construction is set, create mini-batches that are
+as large as possible but not so large as to fill up memory.
+@todo add disk-buffering capability, so that when the cache becomes too
 big for memory, we cache things on disk, trying to keep in memory only
 the record most likely to be accessed next.
 """
 def __init__(self,source_dataset,cache_all_upon_construction=False):
 self.source_dataset=source_dataset
 self.cache_all_upon_construction=cache_all_upon_construction
 if cache_all_upon_construction:
+# This potentially brings all the source examples
+# into memory at once, which may be too much.
+# The work could possibly be done by minibatches
+# that are as large as possible but no larger than what memory allows.
 self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next())
 else:
 self.cached_examples = []
 self.fieldNames = source_dataset.fieldNames

Mercurial > pylearn

comparison dataset.py @ 77:1e2bb5bad636