Mercurial > ift6266
view scripts/ocr_divide.py @ 220:e172ef73cdc5
Ajouté un paquet de type/value checks à SeriesTables, et finalisé les docstrings. Ajouté 3-4 tests. Légers refactorings ici et là sans conséquences externes.
author | fsavard |
---|---|
date | Thu, 11 Mar 2010 10:48:54 -0500 |
parents | 2b6a28e4cadc |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Create the OCR train, valid and test sets.

The source tensor ("trainorig") is split by position:

* valid set = trainorig[:80000]
* test set  = trainorig[80000:160000]
* train set = trainorig[160000:]

trainorig is already shuffled, so contiguous slices are used as-is.
The same split is applied to the data file and to the labels file.
"""

import os

import numpy
from pylearn.io import filetensor as ft

# Source directory (shuffled filetensor files) and destination directory.
dir1 = '/data/lisa/data/ocr_breuel/filetensor/'
dir2 = "/data/lisa/data/ift6266h10/"

# (split name, slice of the source tensor), in the order the files are written.
_SPLITS = (
    ("valid", slice(None, 80000)),
    ("test", slice(80000, 160000)),
    ("train", slice(160000, None)),
)


def _split_filetensor(src_name, kind):
    """Read ``dir1 + src_name`` and write one file per split into ``dir2``.

    ``kind`` is the suffix used in the output file names ("data" or
    "labels"); each output is named ``ocr_<split>_<kind>.ft``.
    """
    # Filetensor files are binary, so open in 'rb'; the context manager
    # closes the handle as soon as the tensor has been read.
    # NOTE(review): assumes ft.read() returns a fully loaded, sliceable
    # array (no lazy reads from the handle) -- confirm against pylearn.
    with open(dir1 + src_name, 'rb') as src:
        d = ft.read(src)

    for split, rows in _SPLITS:
        # Closing each output explicitly guarantees the data is flushed to
        # disk before the permissions are changed below.
        with open(dir2 + "ocr_" + split + "_" + kind + ".ft", 'wb') as dst:
            ft.write(dst, d[rows])


_split_filetensor('unlv-corrected-2010-02-01-shuffled.ft', "data")
_split_filetensor('unlv-corrected-2010-02-01-labels-shuffled.ft', "labels")

# rwxr--r-- on every generated file. ``0o744`` is the octal spelling accepted
# by both Python 2.6+ and Python 3 (the bare ``0744`` form is Python 2 only).
for i in ["train", "valid", "test"]:
    os.chmod(dir2 + "ocr_" + i + "_data.ft", 0o744)
    os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0o744)