changeset 182:2b6a28e4cadc

J'ai reséparé NIST/OCR purs pour avoir des ensembles de test et de validation de 80000 plutôt que 20000, comme on en a discuté au cours
author boulanni <nicolas_boulanger@hotmail.com>
date Sat, 27 Feb 2010 18:28:48 -0500
parents f0f47b045cbf
children 992ca8035a4d
files scripts/nist_divide.py scripts/ocr_divide.py
diffstat 2 files changed, 15 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/nist_divide.py	Sat Feb 27 17:10:37 2010 -0500
+++ b/scripts/nist_divide.py	Sat Feb 27 18:28:48 2010 -0500
@@ -3,8 +3,8 @@
 '''
 creation des ensembles train, valid et test NIST pur
 ensemble test est pris tel quel
-ensemble valid est trainorig[:20000]
-ensemble train est trainorig[20000:]
+ensemble valid est trainorig[:80000]
+ensemble train est trainorig[80000:]
 trainorig est deja shuffled
 '''
 
@@ -20,16 +20,16 @@
 f = open(dir1 + "/all_train_data.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_data.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 f = open(dir1 + "/all_train_labels.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_labels.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + i + "_data.ft", 0744)
--- a/scripts/ocr_divide.py	Sat Feb 27 17:10:37 2010 -0500
+++ b/scripts/ocr_divide.py	Sat Feb 27 18:28:48 2010 -0500
@@ -2,9 +2,9 @@
 
 '''
 creation des ensembles train, valid et test OCR
-ensemble valid est trainorig[:20000]
-ensemble test est trainorig[20000:40000]
-ensemble train est trainorig[40000:]
+ensemble valid est trainorig[:80000]
+ensemble test est trainorig[80000:160000]
+ensemble train est trainorig[160000:]
 trainorig est deja shuffled
 '''
 
@@ -17,20 +17,20 @@
 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_data.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_data.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_labels.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_labels.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744)