view bin/pkldu.py @ 1441:c9179b0ed002

pca - better comments
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 02 Mar 2011 13:13:40 -0500
parents 4b27456d3bce
children 509d6669429d
line wrap: on
line source

#!/bin/env python
"""
 Script to analyze disk usage of pickled files. See usage.
"""
__authors__ = "Ian Goodfellow, Razvan Pascanu"
__copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>"

import  cPickle, optparse, time, sys


# Usage text handed to optparse; it is shown in the generated --help output.
usage = """
    pkldu [OPTIONS] file indices

First argument of the program is the file to analyze. Following arguments
help you indexing in the object. For example :
    pkldu.py foo.pkl .my_field [my_key] 3

will load an object obj from foo.pkl and analyze obj.my_field["my_key"][3]
    """

# (label, size in bytes) pairs, smallest unit first. prettyprint() walks
# such a table in order and keeps the last unit that fits.
space_units = [
    (' B', 1),
    ('kB', 1 << 10),
    ('MB', 1 << 20),
    ('GB', 1 << 30),
    ('TB', 1 << 40),
]

# (label, length in seconds) pairs, smallest unit first.
time_units = [
    ('s', 1),
    ('m', 60),
    ('h', 60 * 60),
]

def load(filepath):
    """Unpickle and return the object stored in the file at ``filepath``.

    The file is opened in binary mode and is closed even when unpickling
    raises.

    NOTE: unpickling executes code embedded in the pickle stream; only use
    this on files that come from a trusted source.
    """
    with open(filepath, 'rb') as f:
        return cPickle.load(f)

def format_string(s, maxlen):
    """Return ``s`` cut down to ``maxlen`` characters and padded on the right
    with spaces, so that short strings come back ``maxlen`` characters wide.
    """
    return s[:maxlen].ljust(maxlen)

def prettyprint(size, units, human_readable=False):
    """Express ``size`` in one of the given units.

    :param size: quantity, expressed in the first unit of ``units``.
    :param units: sequence of ``(label, multiplier)`` pairs; it is scanned
        in order and the last unit whose multiplier does not exceed
        ``size`` wins, so it should be sorted by increasing multiplier.
    :param human_readable: when false, ``size`` is returned untouched
        together with the label of the first unit.
    :returns: ``(value, label)`` tuple, ready for a ``'%6.2f %s'`` format.
    """
    unit_name = units[0][0]
    rval = size
    if human_readable:
        for unit, val in units:
            scaled = float(size) / val
            # ``>=`` (not ``>``) so that an exact multiple such as 1024 bytes
            # or 60 seconds is promoted to the larger unit.
            if scaled >= 1:
                unit_name = unit
                rval = scaled
    return (rval, unit_name)


def _measure(obj):
    """Return ``(n_bytes, seconds)``: the length of ``cPickle.dumps(obj)``
    and the wall-clock time needed to load that string back."""
    payload = cPickle.dumps(obj)
    started = time.time()
    cPickle.loads(payload)
    return len(payload), time.time() - started


def _describe(options, line_format, label, value):
    """Measure ``value`` and return ``(n_bytes, report_line)`` for it."""
    n_bytes, seconds = _measure(value)
    line = line_format % (
        (label,)
        + prettyprint(n_bytes, space_units, options.human)
        + prettyprint(seconds, time_units, options.human))
    return n_bytes, line


def _follow_indices(root, indices):
    """Walk from ``root`` along the command-line ``indices`` and return the
    object that is reached.

    Three forms of index are understood:
      * ``[key]`` -- item lookup with the *string* ``key``;
      * ``.name`` -- attribute lookup;
      * anything else -- evaluated as a Python expression and used as an
        item index (e.g. ``3``).

    Exits the program if a step lands on an object already visited.
    """
    name = 'root_obj'
    current = root
    visited = {id(current): name}

    for field in indices:
        if field.startswith('['):
            if not field.endswith(']'):
                raise ValueError("index %r starts with '[' but does not "
                                 "end with ']'" % (field,))
            key = field[1:-1]
            name += '[' + key + ']'
            current = current[key]
        elif field.startswith('.'):
            # ``field`` already carries its leading dot.
            name += field
            current = getattr(current, field[1:])
        else:
            name += '[' + field + ']'
            # NOTE: eval() runs whatever expression was typed on the command
            # line; never feed this script indices from an untrusted source.
            current = current[eval(field)]

        if id(current) in visited:
            print("You're going in circles, " + name + " is the same as "
                  + visited[id(current)])
            sys.exit(1)
        visited[id(current)] = name

    return current


def analyze(options, filepath, indices):
    """Print the pickled size and the unpickling time of an object and of
    each of its components.

    The object is loaded from ``filepath`` and narrowed down by following
    ``indices`` (see ``_follow_indices``).  Its total size and load time are
    printed first, followed by one line per component:

      * dictionary -- one line per key;
      * list / tuple -- one line per element;
      * anything else -- one line per attribute listed by ``dir()``; names
        starting with ``__`` are skipped unless ``options.reserved`` is set,
        and attributes that cannot be read or pickled are reported as such.

    Sizes are those of ``cPickle.dumps`` called with its default protocol.

    :param options: object with attributes ``human`` (bool, scale values to
        a readable unit), ``reserved`` (bool) and ``order`` (``'asc'`` or
        ``'desc'`` to sort the component lines by size; any other value
        prints them as they are produced).
    :param filepath: path of the pickle file.
    :param indices: sequence of index strings.
    """
    target = _follow_indices(load(filepath), indices)

    total_bytes, total_time = _measure(target)
    print('original object : \t\t\t\t%6.2f %s' % prettyprint(
        total_bytes, space_units, options.human))
    print('load time: %6.2f %s' % prettyprint(
        total_time, time_units, options.human))

    # Lines are held back only when they really get sorted below; comparing
    # with ``in`` / ``==`` (rather than ``is``) makes "-o none" given on the
    # command line behave like the default.
    deferred = options.order in ('desc', 'asc')
    print_entries = []

    def emit(entry, progress):
        if deferred:
            print(progress)
            print_entries.append(entry)
        else:
            print(entry[1])

    if isinstance(target, dict):
        print('Object is a dictionary')
        for key in list(target):
            # '%s' formatting lets non-string keys through format_string().
            key_name = format_string('%s' % (key,), 40)
            entry = _describe(
                options, 'key: %40s %6.2f %s ( loads in %6.2f %s)',
                key_name, target[key])
            emit(entry, 'Processed %s' % key_name)

    elif isinstance(target, (tuple, list)):
        print('Object is a list/tuple of  %s elements' % (len(target),))
        for idx, value in enumerate(target):
            entry = _describe(
                options,
                'entry: %03d \t\t\t\t %6.2f %s ( loads in %6.2f %s)',
                idx, value)
            emit(entry, 'Processed entry number  %s' % idx)

    else:
        print('Object is a ' + str(type(target)))
        for field in dir(target):
            if field.startswith('__') and not options.reserved:
                # Skip this reserved field only: dir() is sorted, so
                # stopping here would hide every later attribute.
                continue
            field_name = format_string(field, 40)
            try:
                entry = _describe(
                    options, 'field: %40s %6.2f %s ( loads in %6.2f %s)',
                    field_name, getattr(target, field))
            except Exception:
                print('Could not pickle field %s' % field_name)
                continue
            emit(entry, 'Processed field  %s' % field_name)

    if deferred:
        print_entries = sorted(print_entries,
                               key=lambda entry: entry[0],
                               reverse=(options.order == 'desc'))
        for entry in print_entries:
            print(entry[1])


def process_options():
    """Parse the command line with optparse.

    Recognised options:
      * ``-H`` / ``--human-readable`` -- sets ``options.human`` to True;
      * ``-r`` / ``--reserved-fields`` -- sets ``options.reserved`` to True;
      * ``-o`` / ``--order-fields`` -- sets ``options.order``; must be one of
        ``none`` (default), ``desc`` or ``asc``, anything else is rejected
        by the parser.

    :returns: the ``(options, args)`` pair from ``OptionParser.parse_args``,
        where ``args`` holds the positional arguments.
    """
    parser = optparse.OptionParser(usage)

    parser.add_option('-H', '--human-readable',
                      dest='human',
                      action='store_true',
                      default=False,
                      help=(' If information should be presented in '
                            'human readable format'))

    parser.add_option('-r', '--reserved-fields',
                      dest='reserved',
                      action='store_true',
                      default=False,
                      help=(' If information about python reserved '
                            ' fields (i.e. starting with `__`) '
                            ' should be displayed'))

    # A 'choice' option makes optparse refuse unknown orderings up front
    # instead of letting them through to the analysis.
    parser.add_option('-o', '--order-fields',
                      dest='order',
                      type='choice',
                      choices=('none', 'desc', 'asc'),
                      default='none',
                      help=(' Order fields according to their size.'
                            ' Possible values are {none, desc, asc}'))

    return parser.parse_args()


if __name__ == '__main__':
    (options, args) = process_options()
    if not args:
        # No pickle file given: show the usage text on stderr and exit with
        # a non-zero status instead of failing with IndexError on args[0].
        sys.exit(usage)
    # First positional argument is the file, the rest index into the object.
    analyze(options, args[0], args[1:])