diff dataset.py @ 77:1e2bb5bad636

toying with different ways to implement learners
author bengioy@bengiomac.local
date Sun, 04 May 2008 15:09:22 -0400
parents b4159cbdc06b
children 3499918faa9d
line wrap: on
line diff
--- a/dataset.py	Sat May 03 22:00:37 2008 -0400
+++ b/dataset.py	Sun May 04 15:09:22 2008 -0400
@@ -127,6 +127,9 @@
        - __iter__
     """
 
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)  # fieldname is ignored; stacks values vertically
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)  # fieldnames is ignored; stacks values horizontally
+        
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "<DataSetType>(<SuperClass1>,<SuperClass2>,...)"
@@ -970,7 +973,10 @@
   (and cached) upon construction of the CachedDataSet, rather at the
   first access.
 
-  TODO: add disk-buffering capability, so that when the cache becomes too
+  @todo when cache_all_upon_construction is set, fill the cache using
+  mini-batches that are as large as possible without filling up memory.
+  
+  @todo add disk-buffering capability, so that when the cache becomes too
   big for memory, we cache things on disk, trying to keep in memory only
   the record most likely to be accessed next.
   """
@@ -978,6 +984,10 @@
       self.source_dataset=source_dataset
       self.cache_all_upon_construction=cache_all_upon_construction
       if cache_all_upon_construction:
+          # This potentially brings all the source examples into memory
+          # at once, which may be too much. The work could instead be done
+          # in minibatches that are as large as possible but no larger
+          # than what memory allows.
           self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next())
       else:
           self.cached_examples = []