comparison scripts/ocr_divide.py @ 137:728e232eaf45

Added script to separate OCR data in train, validation and test sets (raw data)
author boulanni <nicolas_boulanger@hotmail.com>
date Sat, 20 Feb 2010 02:12:57 -0500
parents
children 2b6a28e4cadc
comparison
equal deleted inserted replaced
136:4a78c9b83fee 137:728e232eaf45
#!/usr/bin/env python

'''
Create the OCR train, valid and test sets.
The valid set is trainorig[:20000]
The test set is trainorig[20000:40000]
The train set is trainorig[40000:]
trainorig is already shuffled
'''

from pylearn.io import filetensor as ft
import numpy
import os

dir1 = '/data/lisa/data/ocr_breuel/filetensor/'
dir2 = "/data/lisa/data/ift6266h10/"

# (subset name, slice of the shuffled source), listed in the order the output
# files are written. The same slices are applied to the data and the labels so
# that example i of a subset stays paired with label i of that subset.
SPLITS = [
    ("valid", slice(None, 20000)),
    ("test", slice(20000, 40000)),
    ("train", slice(40000, None)),
]


def _split_and_write(source_name, suffix):
    '''
    Read one filetensor from dir1 and write its three subsets into dir2.

    source_name: file name, relative to dir1, of the shuffled source tensor.
    suffix: "data" or "labels"; each output is dir2/ocr_<subset>_<suffix>.ft.
    '''
    # Filetensor files are binary, so open explicitly in binary mode, and use
    # a context manager so the handle is closed as soon as reading is done.
    with open(dir1 + source_name, 'rb') as src:
        d = ft.read(src)

    for subset, rows in SPLITS:
        out_path = dir2 + "ocr_" + subset + "_" + suffix + ".ft"
        # Closing each output before moving on guarantees it is flushed to
        # disk before the permission change below.
        with open(out_path, 'wb') as dst:
            ft.write(dst, d[rows])


_split_and_write('unlv-corrected-2010-02-01-shuffled.ft', "data")
_split_and_write('unlv-corrected-2010-02-01-labels-shuffled.ft', "labels")

for subset in ["train", "valid", "test"]:
    # 0o744 (rwxr--r--): same permission bits as before, spelled with the
    # octal prefix accepted by both Python 2.6+ and Python 3.
    os.chmod(dir2 + "ocr_" + subset + "_data.ft", 0o744)
    os.chmod(dir2 + "ocr_" + subset + "_labels.ft", 0o744)