Mercurial > pylearn

--- a/bin/pkldu.py	Tue Feb 22 10:04:50 2011 -0500
+++ b/bin/pkldu.py	Tue Feb 22 11:23:32 2011 -0500
@@ -1,16 +1,33 @@
 #!/bin/env python
-import sys
-import cPickle
-import time
+"""
+ Script to analyze disk usage of pickled files. See usage.
+"""
+__authors__ = "Ian Goodfellow, Razvan Pascanu"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
+
+import  cPickle, optparse, time, sys
+
+
+usage = """
+    pkldu [OPTIONS] file indices

-""" Usage:
-first argument is a cPickle file to load
-if no more arguments are supplied, will analyze the disk usage of each element of the root-level object stored in the file
-subsequent arguments let you index into fields / dictionary entries of the object
-For example,
-pkldu.py foo.pkl .my_field [my_key] 3
+First argument of the program is the file to analyze. Following arguments
+help you indexing in the object. For example :
+    pkldu.py foo.pkl .my_field [my_key] 3
+
 will load an object obj from foo.pkl and analyze obj.my_field["my_key"][3]
-"""
+    """
+
+space_units = [(' B', 1),          # (suffix, bytes per unit), ascending
+               ('kB', 1 << 10),
+               ('MB', 1 << 20),
+               ('GB', 1 << 30),
+               ('TB', 1 << 40)]
+# (suffix, seconds per unit), ascending
+time_units = [('s', 1),
+              ('m', 60),
+              ('h', 60 * 60)]

 def load(filepath):
     f = open(filepath,'rb')
@@ -18,123 +35,134 @@
     f.close()
     return obj

-filepath = sys.argv[1]
-
-orig_obj = load(filepath)
-
-cycle_check = {}
-
-obj_name = 'root_obj'
-cycle_check[id(orig_obj)] = obj_name
+def format_string(s, maxlen):
+    """Return `s` as text, cut or space-padded to exactly `maxlen` chars."""
+    s = ('%s' % (s,))[:maxlen]  # '%s' also accepts non-string dict keys
+    return s + ' '*(maxlen - len(s))

-for field in sys.argv[2:]:
-    if field.startswith('['):
-        assert field.endswith(']')
-        obj_name += '[' + field[1:-1] + ']'
-        orig_obj = orig_obj[field[1:-1]]
-    elif field.startswith('.'):
-        obj_name += '.' + field
-        orig_obj = getattr(orig_obj,field[1:])
-    else:
-        obj_name + '[' + field + ']'
-        orig_obj = orig_obj[eval(field)]
-    if id(orig_obj) in cycle_check:
-        print "You're going in circles, "+obj_name+" is the same as "+cycle_check[id(orig_obj)]
-        quit()
+def prettyprint(size, units, human_readable = False):
+    """Return (value, unit name), scaled to the largest fitting unit if asked."""
+    unit_name, rval = units[0][0], size
+    if human_readable:
+        for unit, val in units:
+            # '>=': exactly 1024 B or 60 s moves up a unit
+            if float(size)/val >= 1:
+                unit_name, rval = unit, float(size)/val
+    return (rval, unit_name)
+
+
+def analyze(options, filepath, indices):
+    """Report pickled size and load time of an object and of its parts."""
+    orig_obj = load(filepath)
+    cycle_check = {}
+    obj_name = 'root_obj'
     cycle_check[id(orig_obj)] = obj_name

-s = cPickle.dumps(orig_obj)
-prev_bytes = len(s)
-print 'orig_obj bytes: \t\t\t\t'+str(prev_bytes)
-t1 = time.time()
-x = cPickle.loads(s)
-t2 = time.time()
-prev_t = t2 - t1
-print 'orig load time: '+str(prev_t)
-
-
-idx = 0
-
-while len(dir(orig_obj)) > idx:
-    stop = False
-
-    while True:
-        fields = dir(orig_obj)
-        if idx >= len(fields):
-            stop = True
-            break
-        field = fields[idx]
-
-        success = True
-        try:
-            delattr(orig_obj,field)
-
-        except:
-            print "got error trying to delete "+field
-            idx += 1
-            success = False
-        if success and field in dir(orig_obj):
-            print field + ' reappears after being deleted'
-            idx += 1
-        if success:
-            break
-
-    if stop:
-        break
+    for field in indices:
+        if field.startswith('['):
+            assert field.endswith(']')
+            obj_name += '[' + field[1:-1] + ']'
+            orig_obj = orig_obj[field[1:-1]]
+        elif field.startswith('.'):
+            obj_name += field  # field already starts with '.'
+            orig_obj = getattr(orig_obj,field[1:])
+        else:
+            obj_name += '[' + field + ']'
+            orig_obj = orig_obj[eval(field)]  # NOTE: evals a CLI argument
+        if id(orig_obj) in cycle_check:
+            print ( "You're going in circles, "+obj_name+" is the same as "
+                   +cycle_check[id(orig_obj)])
+            quit()
+        cycle_check[id(orig_obj)] = obj_name

     s = cPickle.dumps(orig_obj)
-    new_bytes = len(s)
-    diff_bytes = prev_bytes - new_bytes
-    prev_bytes = new_bytes
+    prev_bytes = len(s)
+    print 'original object : \t\t\t\t%6.2f %s'%prettyprint(prev_bytes,
+                                                           space_units,
+                                                           options.human)
+    # Time unpickling from the in-memory string (no disk access involved).
     t1 = time.time()
     x = cPickle.loads(s)
     t2 = time.time()
-    new_t = t2 - t1
-    diff_t = prev_t - new_t
-    prev_t = new_t
-    print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
+    prev_t = t2 - t1
+    print 'load time: %6.2f %s'%prettyprint(prev_t, time_units,
+                                            options.human)

-if type(orig_obj) == type({}):
-    print 'orig_obj is a dictionary'
-
-    keys = [ key for key in orig_obj.keys() ]
+    if isinstance(orig_obj, dict):
+        print 'Object is a dictionary'
+        keys = [ key for key in orig_obj.keys() ]
+        for key in keys:
+            key_name = format_string('%s' % (key,), 40)  # keys may be non-str
+            s = cPickle.dumps(orig_obj[key])
+            new_bytes = len(s)
+            t1 = time.time()
+            x = cPickle.loads(s)
+            t2 = time.time()
+            new_t = t2 - t1
+            print 'field: %40s %6.2f %s ( loads in %6.2f %s)'%(
+                (key_name,) +
+                prettyprint(new_bytes, space_units, options.human) +
+                prettyprint(new_t, time_units, options.human) )

-    for key in keys:
-        del orig_obj[key]
+    elif isinstance(orig_obj, (tuple, list)):
+        print 'Object is a list/tuple of ', len(orig_obj), 'elements'
+        for idx, v in enumerate(orig_obj):
+            s = cPickle.dumps(v)
+            new_bytes = len(s)
+            t1 = time.time()
+            x = cPickle.loads(s)
+            t2 = time.time()
+            new_t = t2 - t1
+            print 'entry: %03d \t\t\t\t %6.2f %s ( loads in %6.2f %s)' %(
+                (idx,)+
+                prettyprint(new_bytes, space_units, options.human) +
+                prettyprint(new_t, time_units, options.human) )
+    else:
+        print 'Object is a '+str(type(orig_obj))
+        for field in dir(orig_obj):
+            field_name = format_string( field, 40)
+            if field.startswith('__') and not options.reserved:
+                # Skip only this reserved field; later names must still be listed
+                continue
+            try:
+                s = cPickle.dumps(getattr(orig_obj, field))
+                new_bytes = len(s)
+                t1 = time.time()
+                x = cPickle.loads(s)
+                t2 = time.time()
+                new_t = t2 - t1
+                print 'field: %40s %6.2f %s ( loads in %6.2f %s)' %(
+                    (field_name,)+
+                    prettyprint(new_bytes, space_units, options.human) +
+                    prettyprint(new_t, time_units, options.human) )
+            except Exception:
+                print 'Could not pickle field', field_name

-        s = cPickle.dumps(orig_obj)
-        new_bytes = len(s)
-        t1 = time.time()
-        x = cPickle.loads(s)
-        t2 = time.time()
-        new_t = t2 - t1
-        diff_t = prev_t - new_t
-        prev_t = new_t
-        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
+def process_options():
+    """Parse the command line and return optparse's (options, args) pair.
+
+    `options.human` and `options.reserved` are booleans defaulting to False;
+    `args` holds the pickle file followed by any index arguments.  Exits with
+    a usage error when no file is given.
+    """
+    parser = optparse.OptionParser(usage)
+    parser.add_option('-H', "--human-readable",
+                      dest='human', action="store_true", default=False,
+                      help=(' If information should be presented in '
+                            'human readable format'))
+    parser.add_option('-r', "--reserved-fields",
+                      dest='reserved', action="store_true", default=False,
+                      help=(' If information about python reserved '
+                            ' fields (i.e. starting with `__`) '
+                            ' should be displayed'))
+
+    options, args = parser.parse_args()
+    if not args:
+        # Without this the caller fails with IndexError on args[0].
+        parser.error('no pickle file given')
+    return options, args


-if type(orig_obj) == type([]):
-    print 'orig_obj is a list'
-
-    i = 0
-    while len(orig_obj) > 0:
-        stringrep = str(orig_obj[0])
-        if len(stringrep) > 15:
-            stringrep = stringrep[0:12] + "..."
-        del orig_obj[0]
-
-        s = cPickle.dumps(orig_obj)
-        new_bytes = len(s)
-        diff_bytes = prev_bytes - new_bytes
-        prev_bytes = new_bytes
-
-        t1 = time.time()
-        x = cPickle.loads(s)
-        t2 = time.time()
-        new_t = t2 - t1
-        diff_t = prev_t - new_t
-        prev_t = new_t
-        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
-
-
-        i+= 1
+if __name__ == '__main__':
+    opts, argv = process_options()  # argv: [pickle file, index, index, ...]
+    analyze(opts, argv[0], argv[1:])