# HG changeset patch
# User Guillaume Sicard
# Date 1269261288 14400
# Node ID f6d9b6b89c2ad01c743a47529152936e9025c5b5
# Parent a92ec9939e4f7b7f16cd3cb73f59f4b3e0dc1cfc
added: batch preparation module based on a class ratio

diff -r a92ec9939e4f -r f6d9b6b89c2a scripts/setup_batches.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/setup_batches.py	Mon Mar 22 08:34:48 2010 -0400
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+import random
+from pylearn.io import filetensor as ft
+
+class Batches():
+  def __init__(self):
+    data_path = '/data/lisa/data/nist/by_class/'
+
+    digits_train_data = 'digits/digits_train_data.ft'
+    digits_train_labels = 'digits/digits_train_labels.ft'
+    digits_test_data = 'digits/digits_test_data.ft'
+    digits_test_labels = 'digits/digits_test_labels.ft'
+
+    lower_train_data = 'lower/lower_train_data.ft'
+    lower_train_labels = 'lower/lower_train_labels.ft'
+    #upper_train_data = 'upper/upper_train_data.ft'
+    #upper_train_labels = 'upper/upper_train_labels.ft'
+
+    f_digits_train_data = open(data_path + digits_train_data)
+    f_digits_train_labels = open(data_path + digits_train_labels)
+    f_digits_test_data = open(data_path + digits_test_data)
+    f_digits_test_labels = open(data_path + digits_test_labels)
+
+    f_lower_train_data = open(data_path + lower_train_data)
+    f_lower_train_labels = open(data_path + lower_train_labels)
+    #f_upper_train_data = open(data_path + upper_train_data)
+    #f_upper_train_labels = open(data_path + upper_train_labels)
+
+    self.raw_digits_train_data = ft.read(f_digits_train_data)
+    self.raw_digits_train_labels = ft.read(f_digits_train_labels)
+    self.raw_digits_test_data = ft.read(f_digits_test_data)
+    self.raw_digits_test_labels = ft.read(f_digits_test_labels)
+
+    self.raw_lower_train_data = ft.read(f_lower_train_data)
+    self.raw_lower_train_labels = ft.read(f_lower_train_labels)
+    #self.raw_upper_train_data = ft.read(f_upper_train_data)
+    #self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+
+    f_digits_train_data.close()
+    f_digits_train_labels.close()
+    f_digits_test_data.close()
+    f_digits_test_labels.close()
+
+    f_lower_train_data.close()
+    f_lower_train_labels.close()
+    #f_upper_train_data.close()
+    #f_upper_train_labels.close()
+
+  def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False):
+    self.batch_size = batch_size
+
+    digits_train_size = len(self.raw_digits_train_labels)
+    digits_test_size = len(self.raw_digits_test_labels)
+
+    lower_train_size = len(self.raw_lower_train_labels)
+    #upper_train_size = len(self.raw_upper_train_labels)
+
+    if verbose == True:
+      print 'digits_train_size = %d' %digits_train_size
+      print 'digits_test_size = %d' %digits_test_size
+      print 'lower_train_size = %d' %lower_train_size
+      #print 'upper_train_size = %d' %upper_train_size
+
+    # define main and other datasets
+    raw_main_train_data = self.raw_digits_train_data
+    raw_other_train_data = self.raw_lower_train_data
+    raw_test_data = self.raw_digits_test_data
+
+    raw_main_train_labels = self.raw_digits_train_labels
+    raw_other_train_labels = self.raw_lower_train_labels
+    raw_test_labels = self.raw_digits_test_labels
+
+    main_train_size = len(raw_main_train_data)
+    other_train_size = len(raw_other_train_data)
+    test_size = len(raw_test_data)
+    test_size = int(test_size / batch_size)
+    test_size *= batch_size
+    validation_size = test_size
+
+    # default ratio is the actual ratio in the data
+    if start_ratio == -1:
+      self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.start_ratio = start_ratio
+
+    if end_ratio == -1:
+      self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.end_ratio = end_ratio
+
+    if verbose == True:
+      print 'start_ratio = %f' %self.start_ratio
+      print 'end_ratio = %f' %self.end_ratio
+
+    i_main = 0
+    i_other = 0
+    i_batch = 0
+
+    # compute the number of batches given start and end ratios
+    n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_batches = min([n_main_batch, n_other_batch])
+
+    # train batches
+    self.train_batches = []
+
+    # as long as we have data left in main and other, we create batches
+    while i_main < main_train_size - batch_size - test_size and i_other < other_train_size - batch_size:
+
+      ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches
+      batch_data = []
+      batch_labels = []
+
+      for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio
+        rnd = random.randint(0, 100)
+
+        if rnd < 100 * ratio:
+          batch_data = batch_data + \
+                  [raw_main_train_data[i_main]]
+          batch_labels = batch_labels + \
+                  [raw_main_train_labels[i_main]]
+          i_main += 1
+        else:
+          batch_data = batch_data + \
+                  [raw_other_train_data[i_other]]
+          batch_labels = batch_labels + \
+                  [raw_other_train_labels[i_other]]
+          i_other += 1
+
+      self.train_batches = self.train_batches + \
+              [(batch_data, batch_labels)]
+      i_batch += 1
+
+    offset = i_main
+
+    if verbose == True:
+      print 'n_main = %d' %i_main
+      print 'n_other = %d' %i_other
+      print 'nb_train_batches = %d / %d' %(i_batch, n_batches)
+      print 'offset = %d' %offset
+
+    # test batches
+    self.test_batches = []
+    for i in xrange(0, test_size, batch_size):
+      self.test_batches = self.test_batches + \
+              [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]
+
+    # validation batches
+    self.validation_batches = []
+    for i in xrange(0, test_size, batch_size):
+      self.validation_batches = self.validation_batches + \
+              [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])]
+
+  def get_train_batches(self):
+    return self.train_batches
+
+  def get_test_batches(self):
+    return self.test_batches
+
+  def get_validation_batches(self):
+    return self.validation_batches
+
+  def test_set_batches(self, interval = 1000):
+    for i in xrange(0, len(self.train_batches) - self.batch_size, interval):
+      n_main = 0
+
+      for j in xrange(0, self.batch_size):
+        if self.train_batches[i][1][j] < 10:
+          n_main += 1
+      print 'ratio batch %d : %f' %(i, float(n_main) / float(self.batch_size))
+
+if __name__ == '__main__':
+  batches = Batches()
+  batches.set_batches(0.5, 1, 20, True)
+  batches.test_set_batches()
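
Note on the batch composition schedule (not part of the patch): inside the while loop, set_batches() recomputes, for each training batch, the probability of drawing an example from the "main" set (digits) rather than the "other" set (lowercase letters), interpolating linearly from start_ratio to end_ratio over the computed n_batches. The sketch below is illustrative only; the name ratio_schedule and the example numbers are assumptions, and the patch itself draws random.randint(0, 100) and compares it against 100 * ratio rather than using random.random().

    # Minimal sketch of the per-batch class-ratio schedule used by set_batches():
    #   ratio(i) = start_ratio + i * (end_ratio - start_ratio) / n_batches
    def ratio_schedule(start_ratio, end_ratio, n_batches):
        return [start_ratio + i * (end_ratio - start_ratio) / float(n_batches)
                for i in range(n_batches)]

    if __name__ == '__main__':
        # With set_batches(0.5, 1, ...), early batches are roughly half digits
        # and the digit proportion grows toward 1 for the last batches.
        for i, r in enumerate(ratio_schedule(0.5, 1.0, 5)):
            print('batch %d: P(main) = %.2f' % (i, r))

For this toy schedule of 5 batches the printed probabilities are 0.50, 0.60, 0.70, 0.80, 0.90.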