view bin/pkldu.py @ 1430:931a19eeab5a

'allow to randomize the sparse valid/test utlc dataset at load time'
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 08 Feb 2011 16:19:18 -0500
parents ea5d27727869
children 14ba52c38f07
line wrap: on
line source

#!/bin/env python
import sys
from util import serial
import cPickle
import time

""" Usage:
The first argument is a cPickle file to load.
If no more arguments are supplied, the script analyzes the disk usage of each element of the root-level object stored in the file.
Subsequent arguments let you index into fields / dictionary entries of the object, one level per argument:
  .name    reads the attribute "name"
  [key]    looks up the string key "key"
  other    is evaluated as a Python expression and used as an index (e.g. 3)
For example,
pkldu.py foo.pkl .my_field [my_key] 3
will load an object obj from foo.pkl and analyze obj.my_field["my_key"][3]
"""

filepath = sys.argv[1]

orig_obj = serial.load(filepath)

cycle_check = {}

obj_name = 'root_obj'
cycle_check[id(orig_obj)] = obj_name

for field in sys.argv[2:]:
    if field.startswith('['):
        assert field.endswith(']')
        obj_name += '[' + field[1:-1] + ']'
        orig_obj = orig_obj[field[1:-1]]
    elif field.startswith('.'):
        obj_name += '.' + field
        orig_obj = getattr(orig_obj,field[1:])
    else:
        obj_name + '[' + field + ']'
        orig_obj = orig_obj[eval(field)]
    if id(orig_obj) in cycle_check:
        print "You're going in circles, "+obj_name+" is the same as "+cycle_check[id(orig_obj)]
        quit()
    cycle_check[id(orig_obj)] = obj_name

s = cPickle.dumps(orig_obj)
prev_bytes = len(s)
print 'orig_obj bytes: \t\t\t\t'+str(prev_bytes)
t1 = time.time()
x = cPickle.loads(s)
t2 = time.time()
prev_t = t2 - t1
print 'orig load time: '+str(prev_t)


# Attribute pass: delete the attributes of orig_obj one at a time and, after
# each successful deletion, report how much smaller the pickle became and how
# much faster it loads. This mutates orig_obj destructively.
#
# idx is the position, within dir(orig_obj), of the next name to try. It only
# advances past names that are still listed afterwards (deletion failed, or
# the name reappeared); when a name is really gone idx stays put and the next
# dir() call is expected to yield the following name at that position.
idx = 0

while True:
    fields = dir(orig_obj)
    if idx >= len(fields):
        break
    field = fields[idx]

    try:
        delattr(orig_obj, field)
    except Exception:
        # Not deletable: skip it without measuring. Catching Exception
        # (not a bare except) lets Ctrl-C / SystemExit stop the script.
        print "got error trying to delete "+field
        idx += 1
        continue

    if field in dir(orig_obj):
        # Deletion succeeded but the name is still listed, so step past it;
        # the measurement below is still printed for this name.
        print field + ' reappears after being deleted'
        idx += 1

    # Measure the object without this attribute and report the change
    # relative to the previous measurement.
    s = cPickle.dumps(orig_obj)
    new_bytes = len(s)
    diff_bytes = prev_bytes - new_bytes
    prev_bytes = new_bytes
    t1 = time.time()
    x = cPickle.loads(s)
    t2 = time.time()
    new_t = t2 - t1
    diff_t = prev_t - new_t
    prev_t = new_t
    print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)

if type(orig_obj) == type({}):
    print 'orig_obj is a dictionary'

    keys = [ key for key in orig_obj.keys() ]

    for key in keys:
        del orig_obj[key]

        s = cPickle.dumps(orig_obj)
        new_bytes = len(s)
        t1 = time.time()
        x = cPickle.loads(s)
        t2 = time.time()
        new_t = t2 - t1
        diff_t = prev_t - new_t
        prev_t = new_t
        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)


if type(orig_obj) == type([]):
    print 'orig_obj is a list'

    i = 0
    while len(orig_obj) > 0:
        stringrep = str(orig_obj[0])
        if len(stringrep) > 15:
            stringrep = stringrep[0:12] + "..."
        del orig_obj[0]
        
        s = cPickle.dumps(orig_obj)
        new_bytes = len(s)
        diff_bytes = prev_bytes - new_bytes
        prev_bytes = new_bytes

        t1 = time.time()
        x = cPickle.loads(s)
        t2 = time.time()
        new_t = t2 - t1
        diff_t = prev_t - new_t
        prev_t = new_t
        print field+': \t\t\t\t'+str(diff_bytes)+'\t\t\t'+str(diff_t)


        i+= 1