changeset 187:c03692aa6158

Merge
author fsavard
date Mon, 01 Mar 2010 11:45:59 -0500
parents d364a130b221 (current diff) 81f8466dc121 (diff)
children e98f6b855a7f
files datasets/nist.py
diffstat 14 files changed, 179 insertions(+), 97 deletions(-) [+]
line wrap: on
line diff
--- a/data_generation/pipeline/pipeline.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/pipeline/pipeline.py	Mon Mar 01 11:45:59 2010 -0500
@@ -8,7 +8,7 @@
 
 import sys, os, getopt
 import numpy
-import filetensor as ft
+import ift6266.data_generation.transformations.filetensor as ft
 import random
 
 # To debug locally, also call with -s 100 (to stop after ~100)
@@ -80,23 +80,21 @@
     import pylab
     pylab.ion()
 
-from ift6266.data_generation.transformations import *
-
-from PoivreSel import PoivreSel
-from thick import Thick
-from BruitGauss import BruitGauss
-from DistorsionGauss import DistorsionGauss
-from PermutPixel import PermutPixel
-from gimp_script import GIMP1
-from Rature import Rature
-from contrast import Contrast
-from local_elastic_distortions import LocalElasticDistorter
-from slant import Slant
-from Occlusion import Occlusion
-from add_background_image import AddBackground
-from affine_transform import AffineTransformation
-from ttf2jpg import ttf2jpg
-from Facade import generateCaptcha
+from ift6266.data_generation.transformations.PoivreSel import PoivreSel
+from ift6266.data_generation.transformations.thick import Thick
+from ift6266.data_generation.transformations.BruitGauss import BruitGauss
+from ift6266.data_generation.transformations.DistorsionGauss import DistorsionGauss
+from ift6266.data_generation.transformations.PermutPixel import PermutPixel
+from ift6266.data_generation.transformations.gimp_script import GIMP1
+from ift6266.data_generation.transformations.Rature import Rature
+from ift6266.data_generation.transformations.contrast import Contrast
+from ift6266.data_generation.transformations.local_elastic_distortions import LocalElasticDistorter
+from ift6266.data_generation.transformations.slant import Slant
+from ift6266.data_generation.transformations.Occlusion import Occlusion
+from ift6266.data_generation.transformations.add_background_image import AddBackground
+from ift6266.data_generation.transformations.affine_transform import AffineTransformation
+from ift6266.data_generation.transformations.ttf2jpg import ttf2jpg
+from ift6266.data_generation.transformations.pycaptcha.Facade import generateCaptcha
 
 if DEBUG:
     from visualizer import Visualizer
@@ -383,7 +381,10 @@
         pl.run(img_it, cpx_it)
         pl.write_output(output_file_path, params_output_file_path, labels_output_file_path)
 
-_main()
+try:
+    _main()
+except:
+    print "Unexpected error"
 
 if DEBUG_X:
     pylab.ioff()
--- a/data_generation/transformations/pycaptcha/Captcha/File.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Captcha/File.py	Mon Mar 01 11:45:59 2010 -0500
@@ -36,7 +36,10 @@
         """From our given file list, find a list of full paths to files"""
         paths = []
         for name in self.fileList:
-            path = os.path.join(dataDir, self.basePath, name)
+            if name[0] == '/':
+                path = name
+            else:
+                path = os.path.join(dataDir, self.basePath, name)
             if os.path.isdir(path):
                 for content in os.listdir(path):
                     if self._checkExtension(content):
--- a/data_generation/transformations/pycaptcha/Captcha/Visual/Text.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Text.py	Mon Mar 01 11:45:59 2010 -0500
@@ -39,7 +39,7 @@
         return (fileName, size)
 
 # Predefined font factories
-defaultFontFactory = FontFactory(25, "allfonts")
+defaultFontFactory = FontFactory(25, "/Tmp/allfonts")
 #defaultFontFactory = FontFactory((30, 40), "vera")
 
 class TextLayer(Visual.Layer):
@@ -77,7 +77,17 @@
         self.borderColor = borderColor
 
     def render(self, img):
-        font = ImageFont.truetype(*self.font)
+
+        i=1
+        while True:
+            try:
+                font = ImageFont.truetype(*self.font)
+                break
+            except:
+                print "try#", i, self.font
+                i += 1
+                if i>10: raise
+
     	textSize = font.getsize(self.text)
         draw = ImageDraw.Draw(img)
 
--- a/data_generation/transformations/pycaptcha/Facade.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/data_generation/transformations/pycaptcha/Facade.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
-
-
+import sys, os
+curdir = os.path.dirname(__file__)
+if curdir != '':
+    sys.path.append(curdir)
 
 from Captcha.Visual.Tests import PseudoGimpy, AngryGimpy
 import numpy
--- a/datasets/__init__.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/__init__.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,1 +1,1 @@
-from nist import *
+from defs import *
--- a/datasets/dataset.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/dataset.py	Mon Mar 01 11:45:59 2010 -0500
@@ -6,8 +6,7 @@
         Returns an iterator over the test examples.
 
         Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
           bufsize (int, optional) -- the size of the in-memory buffer,
                                      0 to disable.
         """
@@ -18,8 +17,7 @@
         Returns an iterator over the training examples.
 
         Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
           bufsize (int, optional) -- the size of the in-memory buffer,
                                      0 to disable.
         """
@@ -30,8 +28,7 @@
         Returns an iterator over the validation examples.
 
         Parameters
-          batchsize (int) -- the size of the minibatches, 0 means
-                             return the whole set at once.
+          batchsize (int) -- the size of the minibatches
           bufsize (int, optional) -- the size of the in-memory buffer,
                                      0 to disable.
         """
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/defs.py	Mon Mar 01 11:45:59 2010 -0500
@@ -0,0 +1,38 @@
+__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr']
+
+from ftfile import FTDataSet
+import theano
+
+NIST_PATH = '/data/lisa/data/nist/by_class/'
+DATA_PATH = '/data/lisa/data/ift6266h10/'
+
+nist_digits = FTDataSet(train_data = [NIST_PATH+'digits/digits_train_data.ft'],
+                        train_lbl = [NIST_PATH+'digits/digits_train_labels.ft'],
+                        test_data = [NIST_PATH+'digits/digits_test_data.ft'],
+                        test_lbl = [NIST_PATH+'digits/digits_test_labels.ft'],
+                        indtype=theano.config.floatX, inscale=255.)
+nist_lower = FTDataSet(train_data = [NIST_PATH+'lower/lower_train_data.ft'],
+                        train_lbl = [NIST_PATH+'lower/lower_train_labels.ft'],
+                        test_data = [NIST_PATH+'lower/lower_test_data.ft'],
+                        test_lbl = [NIST_PATH+'lower/lower_test_labels.ft'],
+                        indtype=theano.config.floatX, inscale=255.)
+nist_upper = FTDataSet(train_data = [NIST_PATH+'upper/upper_train_data.ft'],
+                        train_lbl = [NIST_PATH+'upper/upper_train_labels.ft'],
+                        test_data = [NIST_PATH+'upper/upper_test_data.ft'],
+                        test_lbl = [NIST_PATH+'upper/upper_test_labels.ft'],
+                        indtype=theano.config.floatX, inscale=255.)
+
+nist_all = FTDataSet(train_data = [DATA_PATH+'train_data.ft'],
+                     train_lbl = [DATA_PATH+'train_labels.ft'],
+                     test_data = [DATA_PATH+'test_data.ft'],
+                     test_lbl = [DATA_PATH+'test_labels.ft'],
+                     valid_data = [DATA_PATH+'valid_data.ft'],
+                     valid_lbl = [DATA_PATH+'valid_labels.ft'],
+                     indtype=theano.config.floatX, inscale=255.)
+
+ocr = FTDataSet(train_data = [DATA_PATH+'ocr_train_data.ft'],
+                train_lbl = [DATA_PATH+'ocr_train_labels.ft'],
+                test_data = [DATA_PATH+'ocr_test_data.ft'],
+                test_lbl = [DATA_PATH+'ocr_test_labels.ft'],
+                valid_data = [DATA_PATH+'ocr_valid_data.ft'],
+                valid_lbl = [DATA_PATH+'ocr_valid_labels.ft'])
--- a/datasets/dsetiter.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/dsetiter.py	Mon Mar 01 11:45:59 2010 -0500
@@ -107,21 +107,23 @@
               ...
             StopIteration
         """
+        self.buffer = None
         if self.empty:
             raise StopIteration
-        self.buffer = self.curfile.read(self.bufsize)
+        buf = self.curfile.read(self.bufsize)
         
-        while len(self.buffer) < self.bufsize:
+        while len(buf) < self.bufsize:
             try:
                 self.curfile = self.files.next()
             except StopIteration:
                 self.empty = True
-                if len(self.buffer) == 0:
-                    raise StopIteration
-                self.curpos = 0
-                return
-            tmpbuf = self.curfile.read(self.bufsize - len(self.buffer))
-            self.buffer = numpy.row_stack((self.buffer, tmpbuf))
+                if len(buf) == 0:
+                    raise
+                break
+            tmpbuf = self.curfile.read(self.bufsize - len(buf))
+            buf = numpy.row_stack((buf, tmpbuf))
+
+        self.buffer = buf
         self.curpos = 0
 
     def __next__(self):
--- a/datasets/ftfile.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/datasets/ftfile.py	Mon Mar 01 11:45:59 2010 -0500
@@ -1,10 +1,11 @@
 from pylearn.io.filetensor import _read_header, _prod
-import numpy
+import numpy, theano
 from dataset import DataSet
 from dsetiter import DataIterator
+from itertools import izip, imap
 
 class FTFile(object):
-    def __init__(self, fname):
+    def __init__(self, fname, scale=1, dtype=None):
         r"""
         Tests:
             >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
@@ -12,11 +13,15 @@
         self.file = open(fname, 'rb')
         self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False)
         self.size = self.dim[0]
+        self.scale = scale
+        self.dtype = dtype
 
     def skip(self, num):
         r"""
         Skips `num` items in the file.
 
+        If `num` is negative, skips ``size + num`` items, i.e. all but the last ``-num``.
+
         Tests:
             >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
             >>> f.size
@@ -30,7 +35,21 @@
             4020
             >>> f.size
             57646
+            >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
+            >>> f.size
+            58646
+            >>> f.file.tell()
+            20
+            >>> f.skip(-1000)
+            >>> f.file.tell()
+            230604
+            >>> f.size
+            1000
         """
+        if num < 0:
+            num += self.size
+        if num < 0:
+            raise ValueError('Skipping past the start of the file')
         if num >= self.size:
             self.size = 0
         else:
@@ -62,18 +81,34 @@
             num = self.size
         self.dim[0] = num
         self.size -= num
-        return numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        if self.dtype is not None:
+            res = res.astype(self.dtype)
+        if self.scale != 1:
+            res /= self.scale
+        return res
 
 class FTSource(object):
-    def __init__(self, file, skip=0, size=None):
+    def __init__(self, file, skip=0, size=None, dtype=None, scale=1):
         r"""
         Create a data source from a possible subset of a .ft file.
 
         Parameters:
             `file` (string) -- the filename
-            `skip` (int, optional) -- amount of examples to skip from the start of the file
-            `size` (int, optional) -- truncates number of examples read (after skipping)
-        
+            `skip` (int, optional) -- number of examples to skip from
+                                      the start of the file.  If negative,
+                                      skips filesize + skip (all but the last -skip).
+            `size` (int, optional) -- truncates the number of examples
+                                      read (after skipping).  If
+                                      negative, truncates to
+                                      remaining + size examples
+                                      (i.e. drops the last -size).
+            `dtype` (dtype, optional) -- convert the data to this
+                                         dtype after reading.
+            `scale` (number, optional) -- scale (that is divide) the
+                                          data by this number (after
+                                          dtype conversion, if any).
+
         Tests:
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft')
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000)
@@ -83,6 +118,8 @@
         self.file = file
         self.skip = skip
         self.size = size
+        self.dtype = dtype
+        self.scale = scale
     
     def open(self):
         r"""
@@ -100,21 +137,30 @@
            >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1)
            >>> s.open().size
            1
+           >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10)
+           >>> s.open().size
+           58636
         """
-        f = FTFile(self.file)
+        f = FTFile(self.file, scale=self.scale, dtype=self.dtype)
         if self.skip != 0:
             f.skip(self.skip)
         if self.size is not None and self.size < f.size:
-            f.size = self.size
+            if self.size < 0:
+                f.size += self.size
+            else:
+                f.size = self.size
         return f
 
 class FTData(object):
     r"""
     This is a list of FTSources.
     """
-    def __init__(self, datafiles, labelfiles, skip=0, size=None):
-        self.inputs = [FTSource(f, skip, size) for f in  datafiles]
-        self.outputs = [FTSource(f, skip, size) for f in labelfiles]
+    def __init__(self, datafiles, labelfiles, skip=0, size=None,
+                 inscale=1, indtype=None, outscale=1, outdtype=None):
+        self.inputs = [FTSource(f, skip, size, scale=inscale, dtype=indtype)
+                       for f in  datafiles]
+        self.outputs = [FTSource(f, skip, size, scale=outscale, dtype=outdtype)
+                        for f in labelfiles]
 
     def open_inputs(self):
         return [f.open() for f in self.inputs]
@@ -124,7 +170,7 @@
     
 
 class FTDataSet(DataSet):
-    def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None):
+    def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None, indtype=None, outdtype=None, inscale=1, outscale=1):
         r"""
         Defines a DataSet from a bunch of files.
         
@@ -136,6 +182,9 @@
                                          can differ from train.
            `valid_data`, `valid_labels` -- same thing again for validation.
                                            (optional)
+           `indtype`, `outdtype`,  -- see FTSource.__init__()
+           `inscale`, `outscale`      (optional)
+                                                             
 
         If `valid_data` and `valid_labels` are not supplied then a sample
         approximately equal in size to the test set is taken from the train 
@@ -144,13 +193,13 @@
         if valid_data is None:
             total_valid_size = sum(FTFile(td).size for td in test_data)
             valid_size = total_valid_size/len(train_data)
-            self._train = FTData(train_data, train_lbl, skip=valid_size)
-            self._valid = FTData(train_data, train_lbl, size=valid_size)
+            self._train = FTData(train_data, train_lbl, size=-valid_size)
+            self._valid = FTData(train_data, train_lbl, skip=-valid_size)
         else:
             self._train = FTData(train_data, train_lbl)
             self._valid = FTData(valid_data, valid_lbl)
         self._test = FTData(test_data, test_lbl)
 
     def _return_it(self, batchsize, bufsize, ftdata):
-        return zip(DataIterator(ftdata.open_inputs(), batchsize, bufsize),
-                   DataIterator(ftdata.open_outputs(), batchsize, bufsize))
+        return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize),
+                    DataIterator(ftdata.open_outputs(), batchsize, bufsize))
--- a/datasets/nist.py	Mon Mar 01 11:45:25 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all']
-
-from ftfile import FTDataSet
-
-PATH = '/data/lisa/data/nist/by_class/'
-
-nist_digits = FTDataSet(train_data = [PATH+'digits/digits_train_data.ft'],
-                        train_lbl = [PATH+'digits/digits_train_labels.ft'],
-                        test_data = [PATH+'digits/digits_test_data.ft'],
-                        test_lbl = [PATH+'digits/digits_test_labels.ft'])
-nist_lower = FTDataSet(train_data = [PATH+'lower/lower_train_data.ft'],
-                        train_lbl = [PATH+'lower/lower_train_labels.ft'],
-                        test_data = [PATH+'lower/lower_test_data.ft'],
-                        test_lbl = [PATH+'lower/lower_test_labels.ft'])
-nist_upper = FTDataSet(train_data = [PATH+'upper/upper_train_data.ft'],
-                        train_lbl = [PATH+'upper/upper_train_labels.ft'],
-                        test_data = [PATH+'upper/upper_test_data.ft'],
-                        test_lbl = [PATH+'upper/upper_test_labels.ft'])
-nist_all = FTDataSet(train_data = [PATH+'all/all_train_data.ft'],
-                        train_lbl = [PATH+'all/all_train_labels.ft'],
-                        test_data = [PATH+'all/all_test_data.ft'],
-                        test_lbl = [PATH+'all/all_test_labels.ft'])
-
--- a/scripts/nist_divide.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/nist_divide.py	Mon Mar 01 11:45:59 2010 -0500
@@ -3,8 +3,8 @@
 '''
 creation des ensembles train, valid et test NIST pur
 ensemble test est pris tel quel
-ensemble valid est trainorig[:20000]
-ensemble train est trainorig[20000:]
+ensemble valid est trainorig[:80000]
+ensemble train est trainorig[80000:]
 trainorig est deja shuffled
 '''
 
@@ -20,16 +20,16 @@
 f = open(dir1 + "/all_train_data.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_data.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 f = open(dir1 + "/all_train_labels.ft")
 d = ft.read(f)
 f = open(dir2 + "valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "train_labels.ft", 'wb')
-ft.write(f, d[20000:])
+ft.write(f, d[80000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + i + "_data.ft", 0744)
--- a/scripts/ocr_divide.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/ocr_divide.py	Mon Mar 01 11:45:59 2010 -0500
@@ -2,9 +2,9 @@
 
 '''
 creation des ensembles train, valid et test OCR
-ensemble valid est trainorig[:20000]
-ensemble test est trainorig[20000:40000]
-ensemble train est trainorig[40000:]
+ensemble valid est trainorig[:80000]
+ensemble test est trainorig[80000:160000]
+ensemble train est trainorig[160000:]
 trainorig est deja shuffled
 '''
 
@@ -17,20 +17,20 @@
 f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_data.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_data.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_data.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft')
 d = ft.read(f)
 f = open(dir2 + "ocr_valid_labels.ft", 'wb')
-ft.write(f, d[:20000])
+ft.write(f, d[:80000])
 f = open(dir2 + "ocr_test_labels.ft", 'wb')
-ft.write(f, d[20000:40000])
+ft.write(f, d[80000:160000])
 f = open(dir2 + "ocr_train_labels.ft", 'wb')
-ft.write(f, d[40000:])
+ft.write(f, d[160000:])
 
 for i in ["train", "valid", "test"]:
     os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744)
--- a/scripts/run_pipeline.sh	Mon Mar 01 11:45:25 2010 -0500
+++ b/scripts/run_pipeline.sh	Mon Mar 01 11:45:59 2010 -0500
@@ -15,6 +15,6 @@
 	echo $arg >> $PIPELINE_ARGS_TMPFILE
 done
 
-gimp -i --batch-interpreter python-fu-eval --batch - < pipeline.py
+gimp -i --batch-interpreter python-fu-eval --batch - < ../data_generation/pipeline/pipeline.py
 
 
--- a/test.py	Mon Mar 01 11:45:25 2010 -0500
+++ b/test.py	Mon Mar 01 11:45:59 2010 -0500
@@ -5,8 +5,11 @@
     predefs = ift6266.__dict__
     for (_, name, ispkg) in pkgutil.walk_packages(ift6266.__path__, ift6266.__name__+'.'):
         if not ispkg:
-            if name.startswith('ift6266.scripts') or \
-               name in ['ift6266.test']:
+            if name.startswith('ift6266.scripts.') or \
+               name.startswith('ift6266.data_generation.transformations.pycaptcha.') or \
+               name in ['ift6266.test',
+                        'ift6266.data_generation.transformations.testmod',
+                        'ift6266.data_generation.transformations.gimp_script']:
                 continue
             print "Testing:", name
             __import__(name)