diff scripts/setup_batches.py @ 332:5b260cc8f477

Correction de bug numpy array et ajout d'une deuxième classe auxiliaire
author Guillaume Sicard <guitch21@gmail.com>
date Wed, 14 Apr 2010 11:51:18 -0400
parents a6b6b1140de9
children 7bc555cc9aab b0741ea3ff6f
line wrap: on
line diff
--- a/scripts/setup_batches.py	Wed Apr 14 10:17:33 2010 -0400
+++ b/scripts/setup_batches.py	Wed Apr 14 11:51:18 2010 -0400
@@ -15,8 +15,10 @@
 
     lower_train_data = 'lower/lower_train_data.ft'
     lower_train_labels = 'lower/lower_train_labels.ft'
-    #upper_train_data = 'upper/upper_train_data.ft'
-    #upper_train_labels = 'upper/upper_train_labels.ft'
+    upper_train_data = 'upper/upper_train_data.ft'
+    upper_train_labels = 'upper/upper_train_labels.ft'
+    test_data = 'all/all_test_data.ft'
+    test_labels = 'all/all_test_labels.ft'
 
     print 'Opening data...'
 
@@ -27,8 +29,11 @@
 
     f_lower_train_data = open(data_path + lower_train_data)
     f_lower_train_labels = open(data_path + lower_train_labels)
-    #f_upper_train_data = open(data_path + upper_train_data)
-    #f_upper_train_labels = open(data_path + upper_train_labels)
+    f_upper_train_data = open(data_path + upper_train_data)
+    f_upper_train_labels = open(data_path + upper_train_labels)
+
+    f_test_data = open(data_path + test_data)
+    f_test_labels = open(data_path + test_labels)
 
     self.raw_digits_train_data = ft.read(f_digits_train_data)
     self.raw_digits_train_labels = ft.read(f_digits_train_labels)
@@ -37,8 +42,11 @@
 
     self.raw_lower_train_data = ft.read(f_lower_train_data)
     self.raw_lower_train_labels = ft.read(f_lower_train_labels)
-    #self.raw_upper_train_data = ft.read(f_upper_train_data)
-    #self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+    self.raw_upper_train_data = ft.read(f_upper_train_data)
+    self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+
+    self.raw_test_data = ft.read(f_test_data)
+    self.raw_test_labels = ft.read(f_test_labels)
 
     f_digits_train_data.close()
     f_digits_train_labels.close()
@@ -47,8 +55,11 @@
 
     f_lower_train_data.close()
     f_lower_train_labels.close()
-    #f_upper_train_data.close()
-    #f_upper_train_labels.close()
+    f_upper_train_data.close()
+    f_upper_train_labels.close()
+
+    f_test_data.close()
+    f_test_labels.close()
 
     print 'Data opened'
 
@@ -59,25 +70,32 @@
     digits_test_size = len(self.raw_digits_test_labels)
 
     lower_train_size = len(self.raw_lower_train_labels)
-    #upper_train_size = len(self.raw_upper_train_labels)
+    upper_train_size = len(self.raw_upper_train_labels)
 
     if verbose == True:
       print 'digits_train_size = %d' %digits_train_size
       print 'digits_test_size = %d' %digits_test_size
       print 'lower_train_size = %d' %lower_train_size
-      #print 'upper_train_size = %d' %upper_train_size
+      print 'upper_train_size = %d' %upper_train_size
 
     # define main and other datasets
     raw_main_train_data = self.raw_digits_train_data
-    raw_other_train_data = self.raw_lower_train_labels
+    raw_other_train_data1 = self.raw_lower_train_labels
+    raw_other_train_data2 = self.raw_upper_train_labels
     raw_test_data = self.raw_digits_test_data
+    #raw_test_data = self.raw_test_data
 
     raw_main_train_labels = self.raw_digits_train_labels
-    raw_other_train_labels = self.raw_lower_train_labels
+    raw_other_train_labels1 = self.raw_lower_train_labels
+    raw_other_train_labels2 = self.raw_upper_train_labels
     raw_test_labels = self.raw_digits_test_labels
+    #raw_test_labels = self.raw_test_labels
 
-    main_train_size = len(raw_main_train_data)
-    other_train_size = len(raw_other_train_data)
+    main_train_size = len(raw_main_train_labels)
+    other_train_size1 = len(raw_other_train_labels1)
+    other_train_size2 = len(raw_other_train_labels2)
+    other_train_size = other_train_size1 + other_train_size2
+
     test_size = len(raw_test_labels)
     test_size = int(test_size/batch_size)
     test_size *= batch_size
@@ -85,12 +103,12 @@
 
     # default ratio is actual ratio
     if start_ratio == -1:
-      self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+      self.start_ratio = float(main_train_size - test_size) / float(main_train_size + other_train_size)
     else:
       self.start_ratio = start_ratio
 
     if start_ratio == -1:
-      self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+      self.end_ratio = float(main_train_size - test_size) / float(main_train_size + other_train_size)
     else:
       self.end_ratio = end_ratio
 
@@ -99,35 +117,46 @@
       print 'end_ratio = %f' %self.end_ratio
 
     i_main = 0
-    i_other = 0
+    i_other1 = 0
+    i_other2 = 0
     i_batch = 0
 
     # compute the number of batches given start and end ratios
-    n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
-    n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_main_batch = (main_train_size - test_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    if (batch_size != batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2)):
+      n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    else:
+      n_other_batch = n_main_batch
+
     n_batches = min([n_main_batch, n_other_batch])
 
     # train batches
     self.train_batches = []
 
     # as long as we have data left in main and other, we create batches
-    while i_main < main_train_size - batch_size - test_size  and i_other < other_train_size - batch_size:
-
+    while i_main < main_train_size - batch_size - test_size and i_other1 < other_train_size1 - batch_size and i_other2 < other_train_size2 - batch_size:
       ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches
-      batch_data = raw_main_train_data[0:self.batch_size]
-      batch_labels = raw_main_train_labels[0:self.batch_size]
+      batch_data = copy(raw_main_train_data[0:self.batch_size])
+      batch_labels = copy(raw_main_train_labels[0:self.batch_size])
 
       for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio
-	rnd = random.randint(0, 100)
+	rnd1 = random.randint(0, 100)
 
-	if rnd < 100 * ratio:
+	if rnd1 < 100 * ratio:
 	  batch_data[i] = raw_main_train_data[i_main]
 	  batch_labels[i] = raw_main_train_labels[i_main]
 	  i_main += 1
 	else:
-	  batch_data[i] = raw_other_train_data[i_other]
-	  batch_labels[i] = raw_other_train_labels[i_other] - 26 #to put values between 10 and 35 for lower case
-	  i_other += 1
+	  rnd2 = random.randint(0, 100)
+
+	  if rnd2 < 100 * float(other_train_size1) / float(other_train_size):
+	    batch_data[i] = raw_other_train_data1[i_other1]
+	    batch_labels[i] = raw_other_train_labels1[i_other1]
+	    i_other1 += 1
+	  else:
+	    batch_data[i] = raw_other_train_data2[i_other2]
+	    batch_labels[i] = raw_other_train_labels2[i_other2]
+	    i_other2 += 1
 
       self.train_batches = self.train_batches + \
 	      [(batch_data, batch_labels)]
@@ -143,13 +172,14 @@
 
     # validation batches
     self.validation_batches = []
-    for i in xrange(0, test_size, batch_size):
+    for i in xrange(0, validation_size, batch_size):
         self.validation_batches = self.validation_batches + \
             [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])]
 
     if verbose == True:
       print 'n_main = %d' %i_main
-      print 'n_other = %d' %i_other
+      print 'n_other1 = %d' %i_other1
+      print 'n_other2 = %d' %i_other2
       print 'nb_train_batches = %d / %d' %(i_batch,n_batches)
       print 'offset = %d' %offset