comparison scripts/ocr_divide.py @ 182:2b6a28e4cadc

J'ai reséparé NIST/OCR purs pour avoir des ensembles de test et de validation de 80000 plutôt que 20000, comme on a discuté au cours
author boulanni <nicolas_boulanger@hotmail.com>
date Sat, 27 Feb 2010 18:28:48 -0500
parents 728e232eaf45
children
comparison
equal deleted inserted replaced
181:f0f47b045cbf 182:2b6a28e4cadc
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 ''' 3 '''
4 creation des ensembles train, valid et test OCR 4 creation des ensembles train, valid et test OCR
5 ensemble valid est trainorig[:20000] 5 ensemble valid est trainorig[:80000]
6 ensemble test est trainorig[20000:40000] 6 ensemble test est trainorig[80000:160000]
7 ensemble train est trainorig[40000:] 7 ensemble train est trainorig[160000:]
8 trainorig est deja shuffled 8 trainorig est deja shuffled
9 ''' 9 '''
10 10
11 from pylearn.io import filetensor as ft 11 from pylearn.io import filetensor as ft
12 import numpy, os 12 import numpy, os
15 dir2 = "/data/lisa/data/ift6266h10/" 15 dir2 = "/data/lisa/data/ift6266h10/"
16 16
17 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft') 17 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft')
18 d = ft.read(f) 18 d = ft.read(f)
19 f = open(dir2 + "ocr_valid_data.ft", 'wb') 19 f = open(dir2 + "ocr_valid_data.ft", 'wb')
20 ft.write(f, d[:20000]) 20 ft.write(f, d[:80000])
21 f = open(dir2 + "ocr_test_data.ft", 'wb') 21 f = open(dir2 + "ocr_test_data.ft", 'wb')
22 ft.write(f, d[20000:40000]) 22 ft.write(f, d[80000:160000])
23 f = open(dir2 + "ocr_train_data.ft", 'wb') 23 f = open(dir2 + "ocr_train_data.ft", 'wb')
24 ft.write(f, d[40000:]) 24 ft.write(f, d[160000:])
25 25
26 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft') 26 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft')
27 d = ft.read(f) 27 d = ft.read(f)
28 f = open(dir2 + "ocr_valid_labels.ft", 'wb') 28 f = open(dir2 + "ocr_valid_labels.ft", 'wb')
29 ft.write(f, d[:20000]) 29 ft.write(f, d[:80000])
30 f = open(dir2 + "ocr_test_labels.ft", 'wb') 30 f = open(dir2 + "ocr_test_labels.ft", 'wb')
31 ft.write(f, d[20000:40000]) 31 ft.write(f, d[80000:160000])
32 f = open(dir2 + "ocr_train_labels.ft", 'wb') 32 f = open(dir2 + "ocr_train_labels.ft", 'wb')
33 ft.write(f, d[40000:]) 33 ft.write(f, d[160000:])
34 34
35 for i in ["train", "valid", "test"]: 35 for i in ["train", "valid", "test"]:
36 os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744) 36 os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744)
37 os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0744) 37 os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0744)
38 38