Mercurial > ift6266
comparison scripts/ocr_divide.py @ 137:728e232eaf45
Added script to separate OCR data in train, validation and test sets (raw data)
author | boulanni <nicolas_boulanger@hotmail.com> |
---|---|
date | Sat, 20 Feb 2010 02:12:57 -0500 |
parents | |
children | 2b6a28e4cadc |
comparison
equal
deleted
inserted
replaced
136:4a78c9b83fee | 137:728e232eaf45 |
---|---|
#!/usr/bin/env python

'''
Create the OCR train, valid and test sets (raw data).

The valid set is trainorig[:20000]
The test set is trainorig[20000:40000]
The train set is trainorig[40000:]
trainorig is already shuffled.
'''

import os

import numpy  # not referenced in this script; kept from the original imports
from pylearn.io import filetensor as ft

# Directory holding the shuffled source filetensors.
dir1 = '/data/lisa/data/ocr_breuel/filetensor/'
# Directory that receives the train/valid/test filetensors.
dir2 = "/data/lisa/data/ift6266h10/"

# Split boundaries (indices along the first axis of the source tensor).
VALID_END = 20000   # valid = [0, VALID_END)
TEST_END = 40000    # test  = [VALID_END, TEST_END), train = [TEST_END, end)

# Permissions applied to every generated file (rwxr--r--), as in the
# original script. Written as 0o744 so the literal parses on both
# Python 2.6+ and Python 3 (the legacy form 0744 is Python 2 only).
OUTPUT_MODE = 0o744


def _split_and_write(source_name, kind):
    '''
    Read one source filetensor and write its valid/test/train slices.

    source_name -- file name of the source filetensor, relative to dir1
    kind        -- "data" or "labels"; used to build the output names
                   dir2 + "ocr_<split>_<kind>.ft"

    Each output file is closed (and therefore flushed) before its
    permissions are changed.
    '''
    # Filetensor files are binary, so open in 'rb'. The handle is released
    # as soon as the tensor is read, which matches the original script:
    # it rebound `f` right after ft.read().
    with open(dir1 + source_name, 'rb') as src:
        examples = ft.read(src)

    # Same write order as the original script: valid, test, train.
    splits = (
        ("valid", examples[:VALID_END]),
        ("test", examples[VALID_END:TEST_END]),
        ("train", examples[TEST_END:]),
    )
    for split_name, rows in splits:
        path = dir2 + "ocr_" + split_name + "_" + kind + ".ft"
        with open(path, 'wb') as dst:
            ft.write(dst, rows)
        os.chmod(path, OUTPUT_MODE)


_split_and_write('unlv-corrected-2010-02-01-shuffled.ft', "data")
_split_and_write('unlv-corrected-2010-02-01-labels-shuffled.ft', "labels")