# HG changeset patch
# User fsavard
# Date 1267461959 18000
# Node ID c03692aa6158ae62d587f515832033087c8160ef
# Parent  d364a130b221ae2b518417afde80807ccc4b6c07
# Parent  81f8466dc12110fe0866c7a0a61cdba2f58889f6
Merge

diff -r d364a130b221 -r c03692aa6158 data_generation/pipeline/pipeline.py
--- a/data_generation/pipeline/pipeline.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/pipeline/pipeline.py	Mon Mar 01 11:45:59 2010 -0500
@@ -8,7 +8,7 @@
 
 import sys, os, getopt
 import numpy
-import filetensor as ft
+import ift6266.data_generation.transformations.filetensor as ft
 import random
 
 # To debug locally, also call with -s 100 (to stop after ~100)
@@ -80,23 +80,21 @@
 import pylab
 pylab.ion()
 
-from ift6266.data_generation.transformations import *
-
-from PoivreSel import PoivreSel
-from thick import Thick
-from BruitGauss import BruitGauss
-from DistorsionGauss import DistorsionGauss
-from PermutPixel import PermutPixel
-from gimp_script import GIMP1
-from Rature import Rature
-from contrast import Contrast
-from local_elastic_distortions import LocalElasticDistorter
-from slant import Slant
-from Occlusion import Occlusion
-from add_background_image import AddBackground
-from affine_transform import AffineTransformation
-from ttf2jpg import ttf2jpg
-from Facade import generateCaptcha
+from ift6266.data_generation.transformations.PoivreSel import PoivreSel
+from ift6266.data_generation.transformations.thick import Thick
+from ift6266.data_generation.transformations.BruitGauss import BruitGauss
+from ift6266.data_generation.transformations.DistorsionGauss import DistorsionGauss
+from ift6266.data_generation.transformations.PermutPixel import PermutPixel
+from ift6266.data_generation.transformations.gimp_script import GIMP1
+from ift6266.data_generation.transformations.Rature import Rature
+from ift6266.data_generation.transformations.contrast import Contrast
+from ift6266.data_generation.transformations.local_elastic_distortions import LocalElasticDistorter
+from ift6266.data_generation.transformations.slant import Slant
+from ift6266.data_generation.transformations.Occlusion import Occlusion
+from ift6266.data_generation.transformations.add_background_image import AddBackground
+from ift6266.data_generation.transformations.affine_transform import AffineTransformation
+from ift6266.data_generation.transformations.ttf2jpg import ttf2jpg
+from ift6266.data_generation.transformations.pycaptcha.Facade import generateCaptcha
 
 if DEBUG:
     from visualizer import Visualizer
@@ -383,7 +381,10 @@
     pl.run(img_it, cpx_it)
     pl.write_output(output_file_path, params_output_file_path, labels_output_file_path)
 
-_main()
+try:
+    _main()
+except:
+    print "Unexpected error"
 
 if DEBUG_X:
     pylab.ioff()
diff -r d364a130b221 -r c03692aa6158 data_generation/transformations/pycaptcha/Captcha/File.py
--- a/data_generation/transformations/pycaptcha/Captcha/File.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Captcha/File.py	Mon Mar 01 11:45:59 2010 -0500
@@ -36,7 +36,10 @@
         """From our given file list, find a list of full paths to files"""
         paths = []
         for name in self.fileList:
-            path = os.path.join(dataDir, self.basePath, name)
+            if name[0] == '/':
+                path = name
+            else:
+                path = os.path.join(dataDir, self.basePath, name)
             if os.path.isdir(path):
                 for content in os.listdir(path):
                     if self._checkExtension(content):
diff -r d364a130b221 -r c03692aa6158 data_generation/transformations/pycaptcha/Captcha/Visual/Text.py
--- a/data_generation/transformations/pycaptcha/Captcha/Visual/Text.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Text.py	Mon Mar 01 11:45:59 2010 -0500
@@ -39,7 +39,7 @@
         return (fileName, size)
 
 # Predefined font factories
-defaultFontFactory = FontFactory(25, "allfonts")
+defaultFontFactory = FontFactory(25, "/Tmp/allfonts")
 #defaultFontFactory = FontFactory((30, 40), "vera")
 
 class TextLayer(Visual.Layer):
@@ -77,7 +77,17 @@
         self.borderColor = borderColor
 
     def render(self, img):
-        font = ImageFont.truetype(*self.font)
+
+        i=1
+        while True:
+            try:
+                font = ImageFont.truetype(*self.font)
+                break
+            except:
+                print "try#", i, self.font
+                i += 1
+                if i>10: raise
+
         textSize = font.getsize(self.text)
         draw = ImageDraw.Draw(img)
diff -r d364a130b221 -r c03692aa6158 data_generation/transformations/pycaptcha/Facade.py
--- a/data_generation/transformations/pycaptcha/Facade.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Facade.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
-
-
+import sys, os
+curdir = os.path.dirname(__file__)
+if curdir != '':
+    sys.path.append(curdir)
 
 from Captcha.Visual.Tests import PseudoGimpy, AngryGimpy
 import numpy
diff -r d364a130b221 -r c03692aa6158 datasets/__init__.py
--- a/datasets/__init__.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/__init__.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,1 +1,1 @@
-from nist import *
+from defs import *
diff -r d364a130b221 -r c03692aa6158 datasets/dataset.py
--- a/datasets/dataset.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/dataset.py	Mon Mar 01 11:45:59 2010 -0500
@@ -6,8 +6,7 @@
         Returns an iterator over the test examples.
 
         Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
           bufsize (int, optional) -- the size of the in-memory buffer,
                                      0 to disable.
         """
@@ -18,8 +17,7 @@
        Returns an iterator over the training examples.
 
        Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
          bufsize (int, optional) -- the size of the in-memory buffer,
                                     0 to disable.
        """
@@ -30,8 +28,7 @@
        Returns an iterator over the validation examples.
 
        Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
          bufsize (int, optional) -- the size of the in-memory buffer,
                                     0 to disable.
        """
diff -r d364a130b221 -r c03692aa6158 datasets/defs.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/defs.py	Mon Mar 01 11:45:59 2010 -0500
@@ -0,0 +1,38 @@
+__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr']
+
+from ftfile import FTDataSet
+import theano
+
+NIST_PATH = '/data/lisa/data/nist/by_class/'
+DATA_PATH = '/data/lisa/data/ift6266h10/'
+
+nist_digits = FTDataSet(train_data = [NIST_PATH+'digits/digits_train_data.ft'],
+                        train_lbl = [NIST_PATH+'digits/digits_train_labels.ft'],
+                        test_data = [NIST_PATH+'digits/digits_test_data.ft'],
+                        test_lbl = [NIST_PATH+'digits/digits_test_labels.ft'],
+                        indtype=theano.config.floatX, inscale=255.)
+nist_lower = FTDataSet(train_data = [NIST_PATH+'lower/lower_train_data.ft'],
+                       train_lbl = [NIST_PATH+'lower/lower_train_labels.ft'],
+                       test_data = [NIST_PATH+'lower/lower_test_data.ft'],
+                       test_lbl = [NIST_PATH+'lower/lower_test_labels.ft'],
+                       indtype=theano.config.floatX, inscale=255.)
+nist_upper = FTDataSet(train_data = [NIST_PATH+'upper/upper_train_data.ft'],
+                       train_lbl = [NIST_PATH+'upper/upper_train_labels.ft'],
+                       test_data = [NIST_PATH+'upper/upper_test_data.ft'],
+                       test_lbl = [NIST_PATH+'upper/upper_test_labels.ft'],
+                       indtype=theano.config.floatX, inscale=255.)
+
+nist_all = FTDataSet(train_data = [DATA_PATH+'train_data.ft'],
+                     train_lbl = [DATA_PATH+'train_labels.ft'],
+                     test_data = [DATA_PATH+'test_data.ft'],
+                     test_lbl = [DATA_PATH+'test_labels.ft'],
+                     valid_data = [DATA_PATH+'valid_data.ft'],
+                     valid_lbl = [DATA_PATH+'valid_labels.ft'],
+                     indtype=theano.config.floatX, inscale=255.)
+
+ocr = FTDataSet(train_data = [DATA_PATH+'ocr_train_data.ft'],
+                train_lbl = [DATA_PATH+'ocr_train_labels.ft'],
+                test_data = [DATA_PATH+'ocr_test_data.ft'],
+                test_lbl = [DATA_PATH+'ocr_test_labels.ft'],
+                valid_data = [DATA_PATH+'ocr_valid_data.ft'],
+                valid_lbl = [DATA_PATH+'ocr_valid_labels.ft'])
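
With the datasets/__init__.py change above, these definitions are what `from ift6266.datasets import *` now exposes. A minimal usage sketch, assuming the train()/valid()/test() iterator interface documented in datasets/dataset.py and access to the hard-coded LISA cluster paths (the batch size is only illustrative):

    from ift6266.datasets import nist_all

    # train(), valid() and test() each return an iterator over
    # (inputs, labels) minibatches.
    for x, y in nist_all.train(128):
        # x is cast to theano.config.floatX and divided by 255. (inscale);
        # y is the matching batch of labels.
        pass
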
diff -r d364a130b221 -r c03692aa6158 datasets/dsetiter.py
--- a/datasets/dsetiter.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/dsetiter.py	Mon Mar 01 11:45:59 2010 -0500
@@ -107,21 +107,23 @@
         ...
         StopIteration
         """
+        self.buffer = None
         if self.empty:
             raise StopIteration
-        self.buffer = self.curfile.read(self.bufsize)
+        buf = self.curfile.read(self.bufsize)
 
-        while len(self.buffer) < self.bufsize:
+        while len(buf) < self.bufsize:
             try:
                 self.curfile = self.files.next()
             except StopIteration:
                 self.empty = True
-                if len(self.buffer) == 0:
-                    raise StopIteration
-                self.curpos = 0
-                return
-            tmpbuf = self.curfile.read(self.bufsize - len(self.buffer))
-            self.buffer = numpy.row_stack((self.buffer, tmpbuf))
+                if len(buf) == 0:
+                    raise
+                break
+            tmpbuf = self.curfile.read(self.bufsize - len(buf))
+            buf = numpy.row_stack((buf, tmpbuf))
+
+        self.buffer = buf
         self.curpos = 0
 
     def __next__(self):
diff -r d364a130b221 -r c03692aa6158 datasets/ftfile.py
--- a/datasets/ftfile.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/ftfile.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,10 +1,11 @@
 from pylearn.io.filetensor import _read_header, _prod
-import numpy
+import numpy, theano
 from dataset import DataSet
 from dsetiter import DataIterator
+from itertools import izip, imap
 
 class FTFile(object):
-    def __init__(self, fname):
+    def __init__(self, fname, scale=1, dtype=None):
         r"""
         Tests:
         >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
@@ -12,11 +13,15 @@
         self.file = open(fname, 'rb')
         self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False)
         self.size = self.dim[0]
+        self.scale = scale
+        self.dtype = dtype
 
     def skip(self, num):
         r"""
         Skips `num` items in the file.
 
+        If `num` is negative, skips size-num.
+
         Tests:
         >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
         >>> f.size
@@ -30,7 +35,21 @@
         4020
         >>> f.size
         57646
+        >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
+        >>> f.size
+        58646
+        >>> f.file.tell()
+        20
+        >>> f.skip(-1000)
+        >>> f.file.tell()
+        230604
+        >>> f.size
+        1000
         """
+        if num < 0:
+            num += self.size
+        if num < 0:
+            raise ValueError('Skipping past the start of the file')
         if num >= self.size:
             self.size = 0
         else:
@@ -62,18 +81,34 @@
             num = self.size
         self.dim[0] = num
         self.size -= num
-        return numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        if self.dtype is not None:
+            res = res.astype(self.dtype)
+        if self.scale != 1:
+            res /= self.scale
+        return res
 
 class FTSource(object):
-    def __init__(self, file, skip=0, size=None):
+    def __init__(self, file, skip=0, size=None, dtype=None, scale=1):
         r"""
         Create a data source from a possible subset of a .ft file.
 
         Parameters:
             `file` (string) -- the filename
-            `skip` (int, optional) -- amount of examples to skip from the start of the file
-            `size` (int, optional) -- truncates number of examples read (after skipping)
-
+            `skip` (int, optional) -- amount of examples to skip from
+                                      the start of the file.  If
+                                      negative, skips filesize - skip.
+            `size` (int, optional) -- truncates number of examples
+                                      read (after skipping).  If
+                                      negative truncates to
+                                      filesize - size
+                                      (also after skipping).
+            `dtype` (dtype, optional) -- convert the data to this
+                                         dtype after reading.
+            `scale` (number, optional) -- scale (that is divide) the
+                                          data by this number (after
+                                          dtype conversion, if any).
+
         Tests:
         >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
         >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000)
@@ -83,6 +118,8 @@
         self.file = file
         self.skip = skip
         self.size = size
+        self.dtype = dtype
+        self.scale = scale
 
     def open(self):
         r"""
@@ -100,21 +137,30 @@
         >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1)
         >>> s.open().size
         1
+        >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10)
+        >>> s.open().size
+        58636
         """
-        f = FTFile(self.file)
+        f = FTFile(self.file, scale=self.scale, dtype=self.dtype)
         if self.skip != 0:
             f.skip(self.skip)
         if self.size is not None and self.size < f.size:
-            f.size = self.size
+            if self.size < 0:
+                f.size += self.size
+            else:
+                f.size = self.size
         return f
 
 class FTData(object):
     r"""
     This is a list of FTSources.
     """
-    def __init__(self, datafiles, labelfiles, skip=0, size=None):
-        self.inputs = [FTSource(f, skip, size) for f in datafiles]
-        self.outputs = [FTSource(f, skip, size) for f in labelfiles]
+    def __init__(self, datafiles, labelfiles, skip=0, size=None,
+                 inscale=1, indtype=None, outscale=1, outdtype=None):
+        self.inputs = [FTSource(f, skip, size, scale=inscale, dtype=indtype)
+                       for f in datafiles]
+        self.outputs = [FTSource(f, skip, size, scale=outscale, dtype=outdtype)
+                        for f in labelfiles]
 
     def open_inputs(self):
         return [f.open() for f in self.inputs]
@@ -124,7 +170,7 @@
 
 class FTDataSet(DataSet):
-    def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None):
+    def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None, indtype=None, outdtype=None, inscale=1, outscale=1):
         r"""
         Defines a DataSet from a bunch of files.
 
@@ -136,6 +182,9 @@
                            can differ from train.
         `valid_data`, `valid_labels` -- same thing again for validation.
                                         (optional)
+        `indtype`, `outdtype`,  -- see FTSource.__init__()
+        `inscale`, `outscale`      (optional)
+
 
         If `valid_data` and `valid_labels` are not supplied then a sample
         approximately equal in size to the test set is taken from the train
@@ -144,13 +193,13 @@
         if valid_data is None:
             total_valid_size = sum(FTFile(td).size for td in test_data)
             valid_size = total_valid_size/len(train_data)
-            self._train = FTData(train_data, train_lbl, skip=valid_size)
-            self._valid = FTData(train_data, train_lbl, size=valid_size)
+            self._train = FTData(train_data, train_lbl, size=-valid_size)
+            self._valid = FTData(train_data, train_lbl, skip=-valid_size)
         else:
             self._train = FTData(train_data, train_lbl)
             self._valid = FTData(valid_data, valid_lbl)
         self._test = FTData(test_data, test_lbl)
 
     def _return_it(self, batchsize, bufsize, ftdata):
-        return zip(DataIterator(ftdata.open_inputs(), batchsize, bufsize),
-                   DataIterator(ftdata.open_outputs(), batchsize, bufsize))
+        return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize),
+                    DataIterator(ftdata.open_outputs(), batchsize, bufsize))
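
A short sketch of the FTFile/FTSource semantics added above (negative skip/size, plus dtype and scale), assuming the module is importable as ift6266.datasets.ftfile and using the NIST digits test files from the doctests (58646 examples):

    import theano
    from ift6266.datasets.ftfile import FTSource

    labels = '/data/lisa/data/nist/by_class/digits/digits_test_labels.ft'

    # size=-1000 keeps everything but the last 1000 examples;
    # skip=-1000 keeps only the last 1000.
    train_part = FTSource(labels, size=-1000)
    valid_part = FTSource(labels, skip=-1000)
    print train_part.open().size, valid_part.open().size  # 57646 1000

    # dtype and scale are applied when the data is read:
    # cast to floatX first, then divide by 255.
    data = '/data/lisa/data/nist/by_class/digits/digits_test_data.ft'
    pixels = FTSource(data, dtype=theano.config.floatX, scale=255.).open().read(10)

FTDataSet relies on the same mechanism: when no validation files are given, it now builds the training source with size=-valid_size and the validation source with skip=-valid_size, so both are carved from the end of the training files without overlap.
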
diff -r d364a130b221 -r c03692aa6158 datasets/nist.py
--- a/datasets/nist.py	Mon Mar 01 11:45:25 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all']
-
-from ftfile import FTDataSet
-
-PATH = '/data/lisa/data/nist/by_class/'
-
-nist_digits = FTDataSet(train_data = [PATH+'digits/digits_train_data.ft'],
-                        train_lbl = [PATH+'digits/digits_train_labels.ft'],
-                        test_data = [PATH+'digits/digits_test_data.ft'],
-                        test_lbl = [PATH+'digits/digits_test_labels.ft'])
-nist_lower = FTDataSet(train_data = [PATH+'lower/lower_train_data.ft'],
-                       train_lbl = [PATH+'lower/lower_train_labels.ft'],
-                       test_data = [PATH+'lower/lower_test_data.ft'],
-                       test_lbl = [PATH+'lower/lower_test_labels.ft'])
-nist_upper = FTDataSet(train_data = [PATH+'upper/upper_train_data.ft'],
-                       train_lbl = [PATH+'upper/upper_train_labels.ft'],
-                       test_data = [PATH+'upper/upper_test_data.ft'],
-                       test_lbl = [PATH+'upper/upper_test_labels.ft'])
-nist_all = FTDataSet(train_data = [PATH+'all/all_train_data.ft'],
-                     train_lbl = [PATH+'all/all_train_labels.ft'],
-                     test_data = [PATH+'all/all_test_data.ft'],
-                     test_lbl = [PATH+'all/all_test_labels.ft'])
-
diff -r d364a130b221 -r c03692aa6158 scripts/nist_divide.py
--- a/scripts/nist_divide.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/nist_divide.py	Mon Mar 01 11:45:59 2010 -0500
@@ -3,8 +3,8 @@
 '''
 creation des ensembles train, valid et test NIST pur
 ensemble test est pris tel quel
-ensemble valid est trainorig[:20000]
-ensemble train est trainorig[20000:]
+ensemble valid est trainorig[:80000]
+ensemble train est trainorig[80000:]
 trainorig est deja shuffled
 '''
 
@@ -20,16 +20,16 @@
 f = open(dir1 + "/all_train_data.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_data.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 f = open(dir1 + "/all_train_labels.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_labels.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + i + "_data.ft", 0744)
diff -r d364a130b221 -r c03692aa6158 scripts/ocr_divide.py
--- a/scripts/ocr_divide.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/ocr_divide.py	Mon Mar 01 11:45:59 2010 -0500
@@ -2,9 +2,9 @@
 '''
 creation des ensembles train, valid et test OCR
 
-ensemble valid est trainorig[:20000]
-ensemble test est trainorig[20000:40000]
-ensemble train est trainorig[40000:]
+ensemble valid est trainorig[:80000]
+ensemble test est trainorig[80000:160000]
+ensemble train est trainorig[160000:]
 trainorig est deja shuffled
 '''
 
@@ -17,20 +17,20 @@
 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_data.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_data.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_labels.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_labels.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744)
diff -r d364a130b221 -r c03692aa6158 scripts/run_pipeline.sh
--- a/scripts/run_pipeline.sh	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/run_pipeline.sh	Mon Mar 01 11:45:59 2010 -0500
@@ -15,6 +15,6 @@
   echo $arg >> $PIPELINE_ARGS_TMPFILE
 done
 
-gimp -i --batch-interpreter python-fu-eval --batch - < pipeline.py
+gimp -i --batch-interpreter python-fu-eval --batch - < ../data_generation/pipeline/pipeline.py
 
diff -r d364a130b221 -r c03692aa6158 test.py
--- a/test.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/test.py	Mon Mar 01 11:45:59 2010 -0500
@@ -5,8 +5,11 @@
 predefs = ift6266.__dict__
 for (_, name, ispkg) in pkgutil.walk_packages(ift6266.__path__, ift6266.__name__+'.'):
     if not ispkg:
-        if name.startswith('ift6266.scripts') or \
-           name in ['ift6266.test']:
+        if name.startswith('ift6266.scripts.') or \
+           name.startswith('ift6266.data_generation.transformations.pycaptcha.') or \
+           name in ['ift6266.test',
+                    'ift6266.data_generation.transformations.testmod',
+                    'ift6266.data_generation.transformations.gimp_script']:
             continue
         print "Testing:", name
         __import__(name)