Mercurial > ift6266
comparison scripts/ocr_divide.py @ 182:2b6a28e4cadc
J'ai reséparé NIST/OCR purs pour avoir des ensembles de test et de validation de 80000 plutôt que 20000, comme on a discuté au cours
author | boulanni <nicolas_boulanger@hotmail.com> |
---|---|
date | Sat, 27 Feb 2010 18:28:48 -0500 |
parents | 728e232eaf45 |
children |
Comparison legend:
- equal
- deleted
- inserted
- replaced
181:f0f47b045cbf | 182:2b6a28e4cadc |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 ''' | 3 ''' |
4 creation des ensembles train, valid et test OCR | 4 creation des ensembles train, valid et test OCR |
5 ensemble valid est trainorig[:20000] | 5 ensemble valid est trainorig[:80000] |
6 ensemble test est trainorig[20000:40000] | 6 ensemble test est trainorig[80000:160000] |
7 ensemble train est trainorig[40000:] | 7 ensemble train est trainorig[160000:] |
8 trainorig est deja shuffled | 8 trainorig est deja shuffled |
9 ''' | 9 ''' |
10 | 10 |
11 from pylearn.io import filetensor as ft | 11 from pylearn.io import filetensor as ft |
12 import numpy, os | 12 import numpy, os |
15 dir2 = "/data/lisa/data/ift6266h10/" | 15 dir2 = "/data/lisa/data/ift6266h10/" |
16 | 16 |
17 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft') | 17 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft') |
18 d = ft.read(f) | 18 d = ft.read(f) |
19 f = open(dir2 + "ocr_valid_data.ft", 'wb') | 19 f = open(dir2 + "ocr_valid_data.ft", 'wb') |
20 ft.write(f, d[:20000]) | 20 ft.write(f, d[:80000]) |
21 f = open(dir2 + "ocr_test_data.ft", 'wb') | 21 f = open(dir2 + "ocr_test_data.ft", 'wb') |
22 ft.write(f, d[20000:40000]) | 22 ft.write(f, d[80000:160000]) |
23 f = open(dir2 + "ocr_train_data.ft", 'wb') | 23 f = open(dir2 + "ocr_train_data.ft", 'wb') |
24 ft.write(f, d[40000:]) | 24 ft.write(f, d[160000:]) |
25 | 25 |
26 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft') | 26 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft') |
27 d = ft.read(f) | 27 d = ft.read(f) |
28 f = open(dir2 + "ocr_valid_labels.ft", 'wb') | 28 f = open(dir2 + "ocr_valid_labels.ft", 'wb') |
29 ft.write(f, d[:20000]) | 29 ft.write(f, d[:80000]) |
30 f = open(dir2 + "ocr_test_labels.ft", 'wb') | 30 f = open(dir2 + "ocr_test_labels.ft", 'wb') |
31 ft.write(f, d[20000:40000]) | 31 ft.write(f, d[80000:160000]) |
32 f = open(dir2 + "ocr_train_labels.ft", 'wb') | 32 f = open(dir2 + "ocr_train_labels.ft", 'wb') |
33 ft.write(f, d[40000:]) | 33 ft.write(f, d[160000:]) |
34 | 34 |
35 for i in ["train", "valid", "test"]: | 35 for i in ["train", "valid", "test"]: |
36 os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744) | 36 os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744) |
37 os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0744) | 37 os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0744) |
38 | 38 |