# HG changeset patch
# User Razvan Pascanu
# Date 1298391812 18000
# Node ID 3dd64c115657a49dc09622d4727cbe68ab00de11
# Parent  f88508a42a38ec46e02b6230f5c0be5b04b96220
revised version of pkldu that is a bit more structured code-wise and outputs in human readable units

diff -r f88508a42a38 -r 3dd64c115657 bin/pkldu.py
--- a/bin/pkldu.py	Tue Feb 22 10:04:50 2011 -0500
+++ b/bin/pkldu.py	Tue Feb 22 11:23:32 2011 -0500
@@ -1,16 +1,33 @@
 #!/bin/env python
-import sys
-import cPickle
-import time
+"""
+    Script to analyze disk usage of pickled files. See usage.
+"""
+__authors__ = "Ian Goodfellow, Razvan Pascanu"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__contact__ = "Razvan Pascanu "
+
+import cPickle, optparse, time, sys
+
+
+usage = """
+    pkldu [OPTIONS] file indices
 
-""" Usage:
-first argument is a cPickle file to load
-if no more arguments are supplied, will analyze the disk usage of each element of the root-level object stored in the file
-subsequent arguments let you index into fields / dictionary entries of the object
-For example,
-pkldu.py foo.pkl .my_field [my_key] 3
+The first argument of the program is the file to analyze. The following
+arguments let you index into the object. For example:
+    pkldu.py foo.pkl .my_field [my_key] 3
+will load an object obj from foo.pkl and analyze obj.my_field["my_key"][3]
-"""
+    """
+
+space_units = [(' B', 1),
+               ('kB', 2**10),
+               ('MB', 2**20),
+               ('GB', 2**30),
+               ('TB', 2**40)]
+
+time_units = [('s', 1),
+              ('m', 60),
+              ('h', 3600) ]
 
 def load(filepath):
     f = open(filepath,'rb')
@@ -18,123 +35,134 @@
     f.close()
     return obj
 
-filepath = sys.argv[1]
-
-orig_obj = load(filepath)
-
-cycle_check = {}
-
-obj_name = 'root_obj'
-cycle_check[id(orig_obj)] = obj_name
+def format_string(s, maxlen):
+    if len(s) > maxlen:
+        s = s[:maxlen]
+    return s + ' '*(maxlen - len(s))
 
-for field in sys.argv[2:]:
-    if field.startswith('['):
-        assert field.endswith(']')
-        obj_name += '[' + field[1:-1] + ']'
-        orig_obj = orig_obj[field[1:-1]]
-    elif field.startswith('.'):
-        obj_name += '.' + field
-        orig_obj = getattr(orig_obj,field[1:])
-    else:
-        obj_name + '[' + field + ']'
-        orig_obj = orig_obj[eval(field)]
-    if id(orig_obj) in cycle_check:
-        print "You're going in circles, "+obj_name+" is the same as "+cycle_check[id(orig_obj)]
-        quit()
+def prettyprint(size, units, human_readable = False):
+    unit_name = units[0][0]
+    rval = size
+    if human_readable:
+        for unit, val in units:
+            if float(size)/val > 1:
+                unit_name = unit
+                rval = float(size)/val
+    return (rval, unit_name)
+
+
+def analyze(options, filepath, indices):
+
+    orig_obj = load(filepath)
+    cycle_check = {}
+    obj_name = 'root_obj'
     cycle_check[id(orig_obj)] = obj_name
 
-s = cPickle.dumps(orig_obj)
-prev_bytes = len(s)
-print 'orig_obj bytes: \t\t\t\t'+str(prev_bytes)
-t1 = time.time()
-x = cPickle.loads(s)
-t2 = time.time()
-prev_t = t2 - t1
-print 'orig load time: '+str(prev_t)
-
-
-idx = 0
-
-while len(dir(orig_obj)) > idx:
-    stop = False
-
-    while True:
-        fields = dir(orig_obj)
-        if idx >= len(fields):
-            stop = True
-            break
-        field = fields[idx]
-
-        success = True
-        try:
-            delattr(orig_obj,field)
-
-        except:
-            print "got error trying to delete "+field
-            idx += 1
-            success = False
-        if success and field in dir(orig_obj):
-            print field + ' reappears after being deleted'
-            idx += 1
-        if success:
-            break
-
-    if stop:
-        break
+    for field in indices:
+        if field.startswith('['):
+            assert field.endswith(']')
+            obj_name += '[' + field[1:-1] + ']'
+            orig_obj = orig_obj[field[1:-1]]
+        elif field.startswith('.'):
+            obj_name += '.' + field
+            orig_obj = getattr(orig_obj,field[1:])
+        else:
+            obj_name += '[' + field + ']'
+            orig_obj = orig_obj[eval(field)]
+        if id(orig_obj) in cycle_check:
+            print ( "You're going in circles, "+obj_name+" is the same as "
+                   +cycle_check[id(orig_obj)])
+            quit()
+        cycle_check[id(orig_obj)] = obj_name
 
     s = cPickle.dumps(orig_obj)
-    new_bytes = len(s)
-    diff_bytes = prev_bytes - new_bytes
-    prev_bytes = new_bytes
+    prev_bytes = len(s)
+    print 'original object : \t\t\t\t%6.2f %s'%prettyprint(prev_bytes,
+                                                           space_units,
+                                                           options.human)
+
     t1 = time.time()
     x = cPickle.loads(s)
     t2 = time.time()
-    new_t = t2 - t1
-    diff_t = prev_t - new_t
-    prev_t = new_t
-    print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
+    prev_t = t2 - t1
+    print 'load time: %6.2f %s'%prettyprint(prev_t, time_units,
+                                            options.human)
 
-if type(orig_obj) == type({}):
-    print 'orig_obj is a dictionary'
-
-    keys = [ key for key in orig_obj.keys() ]
+    if isinstance(orig_obj, dict):
+        print 'Object is a dictionary'
+        keys = [ key for key in orig_obj.keys() ]
+        for key in keys:
+            key_name = format_string(str(key), 40)
+            s = cPickle.dumps(orig_obj[key])
+            new_bytes = len(s)
+            t1 = time.time()
+            x = cPickle.loads(s)
+            t2 = time.time()
+            new_t = t2 - t1
+            print 'field: %40s %6.2f %s ( loads in %6.2f %s)'%(
+                (key_name,) +
+                prettyprint(new_bytes, space_units, options.human) +
+                prettyprint(new_t, time_units, options.human) )
 
-    for key in keys:
-        del orig_obj[key]
+    elif isinstance(orig_obj, (tuple, list)):
+        print 'Object is a list/tuple of ', len(orig_obj), 'elements'
+        for idx, v in enumerate(orig_obj):
+            s = cPickle.dumps(v)
+            new_bytes = len(s)
+            t1 = time.time()
+            x = cPickle.loads(s)
+            t2 = time.time()
+            new_t = t2 - t1
+            print 'entry: %03d \t\t\t\t %6.2f %s ( loads in %6.2f %s)' %(
+                (idx,) +
+                prettyprint(new_bytes, space_units, options.human) +
+                prettyprint(new_t, time_units, options.human) )
+    else:
+        print 'Object is a '+str(type(orig_obj))
+        for field in dir(orig_obj):
+            field_name = format_string( field, 40)
+            if field.startswith('__') and not options.reserved:
+                # skip python reserved fields unless -r was given
+                continue
+            try:
+                s = cPickle.dumps(getattr(orig_obj, field))
+                new_bytes = len(s)
+                t1 = time.time()
+                x = cPickle.loads(s)
+                t2 = time.time()
+                new_t = t2 - t1
+                print 'field: %40s %6.2f %s ( loads in %6.2f %s)' %(
+                    (field_name,) +
+                    prettyprint(new_bytes, space_units, options.human) +
+                    prettyprint(new_t, time_units, options.human) )
+            except:
+                print 'Could not pickle field', field_name
 
-        s = cPickle.dumps(orig_obj)
-        new_bytes = len(s)
-        t1 = time.time()
-        x = cPickle.loads(s)
-        t2 = time.time()
-        new_t = t2 - t1
-        diff_t = prev_t - new_t
-        prev_t = new_t
-        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
+def process_options():
+
+    parser = optparse.OptionParser(usage)
+
+    parser.add_option( '-H'
+                     , "--human-readable"
+                     , dest = 'human'
+                     , action="store_true"
+                     , default=False
+                     , help = ('If information should be presented in '
+                               'human readable format')
+                     )
+
+    parser.add_option( '-r'
+                     , "--reserved-fields"
+                     , dest = 'reserved'
+                     , action="store_true"
+                     , default=False
+                     , help = ('If information about python reserved '
+                               'fields (i.e. starting with `__`) '
+                               'should be displayed')
+                     )
+    return parser.parse_args()
 
-if type(orig_obj) == type([]):
-    print 'orig_obj is a list'
-
-    i = 0
-    while len(orig_obj) > 0:
-        stringrep = str(orig_obj[0])
-        if len(stringrep) > 15:
-            stringrep = stringrep[0:12] + "..."
-        del orig_obj[0]
-
-        s = cPickle.dumps(orig_obj)
-        new_bytes = len(s)
-        diff_bytes = prev_bytes - new_bytes
-        prev_bytes = new_bytes
-
-        t1 = time.time()
-        x = cPickle.loads(s)
-        t2 = time.time()
-        new_t = t2 - t1
-        diff_t = prev_t - new_t
-        prev_t = new_t
-        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)
-
-
-        i+= 1
+if __name__ == '__main__':
+    (options,args) = process_options()
+    analyze(options, args[0], args[1:])
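
For reference, a minimal standalone sketch of the measurement idea the revised analyze() is built on: each attribute is pickled on its own with cPickle.dumps to get its serialized size, then reloaded with cPickle.loads to time it, and python reserved (dunder) fields are skipped by default. The helper report_field_sizes and the Foo class below are hypothetical illustrations, not part of pkldu.py; the sketch assumes Python 2 (cPickle, print statements), matching the script.

import cPickle, time

def report_field_sizes(obj):
    # Hypothetical helper mirroring analyze(): pickle each attribute
    # separately and record its byte size and reload time.
    for field in dir(obj):
        if field.startswith('__'):
            # python reserved fields are skipped, like the default
            # (no -r) behaviour of pkldu.py
            continue
        try:
            s = cPickle.dumps(getattr(obj, field))
        except Exception:
            print 'Could not pickle field', field
            continue
        t1 = time.time()
        cPickle.loads(s)
        t2 = time.time()
        print '%-20s %10d bytes ( loads in %.4f s)' % (field, len(s), t2 - t1)

class Foo(object):
    # Hypothetical object with one small and one large attribute
    def __init__(self):
        self.small = range(10)
        self.large = range(10 ** 6)

report_field_sizes(Foo())

Note that pickling fields one at a time only approximates their share of the file on disk: sub-objects shared between fields are serialized (and counted) once per field, which also holds for the per-field numbers printed by pkldu.py.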