Mercurial > ift6266
changeset 137:728e232eaf45
Added script to separate OCR data in train, validation and test sets (raw data)
author | boulanni <nicolas_boulanger@hotmail.com> |
---|---|
date | Sat, 20 Feb 2010 02:12:57 -0500 |
parents | 4a78c9b83fee |
children | 128507ac4edf |
files | scripts/ocr_divide.py |
diffstat | 1 files changed, 40 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python

'''
Create the OCR train, valid and test sets.

valid set is trainorig[:20000]
test set is trainorig[20000:40000]
train set is trainorig[40000:]
trainorig is already shuffled
'''

from pylearn.io import filetensor as ft
import numpy
import os

# Directory holding the shuffled source filetensor files.
dir1 = '/data/lisa/data/ocr_breuel/filetensor/'
# Directory receiving the divided files.
dir2 = "/data/lisa/data/ift6266h10/"

# (subset name, slice start, slice stop), in the order the files are written.
# None leaves that end of the slice open.
_SPLITS = (
    ("valid", None, 20000),
    ("test", 20000, 40000),
    ("train", 40000, None),
)

# Permission bits applied to every generated file (rwxr--r--).
_OUTPUT_MODE = 0o744


def _output_path(subset, kind):
    '''Return the path in dir2 of the file for `subset` ("train", "valid"
    or "test") and `kind` ("data" or "labels").'''
    return dir2 + "ocr_" + subset + "_" + kind + ".ft"


def _divide(source_name, kind):
    '''Read dir1 + source_name and write its valid/test/train slices to dir2.

    source_name -- file name, relative to dir1, of the filetensor to split
    kind        -- "data" or "labels"; selects the output file names

    Every file is opened in a `with` block so it is closed (and the output
    flushed to disk) as soon as it has been read or written.
    '''
    # Binary mode, matching the 'wb' mode used for the outputs.
    with open(dir1 + source_name, 'rb') as source:
        whole = ft.read(source)
    for subset, start, stop in _SPLITS:
        with open(_output_path(subset, kind), 'wb') as target:
            ft.write(target, whole[start:stop])


_divide('unlv-corrected-2010-02-01-shuffled.ft', "data")
_divide('unlv-corrected-2010-02-01-labels-shuffled.ft', "labels")

# Set the permissions once every file has been written.
for subset in ["train", "valid", "test"]:
    os.chmod(_output_path(subset, "data"), _OUTPUT_MODE)
    os.chmod(_output_path(subset, "labels"), _OUTPUT_MODE)