# HG changeset patch
# User fsavard
# Date 1265234907 18000
# Node ID fdb0e0870fb4b491b204dd420a258ba70532cba2
# Parent 0f13379947168d2e400a050bbaec50431da07b71
Many changes to pipeline.py to generalize it and start on visualization, and created a wrapper (run_pipeline.sh) to invoke it through GIMP.

- Changes to pipeline.py
    - Wrapped the pipeline loop in a class
    - Split the concerns of iterating over batches and over complexities into separate iterators
    - This allows complicated orderings of batches (multiple sources) and of complexities
    - regenerate_parameters() is now called for each image
    - Command-line arguments via getopt(); new options can be added this way
- run_pipeline.sh
    - The goal is to allow passing arguments. There is no easy way (I found no simple one) to do this on the command line when invoking GIMP in batch mode, so this is a hack.
    - The ultimate goal is to be able to launch jobs on the clusters with dbidispatch, specifying the options (different for each job) on the command line.

diff -r 0f1337994716 -r fdb0e0870fb4 transformations/pipeline.py
--- a/transformations/pipeline.py	Wed Feb 03 16:42:59 2010 -0500
+++ b/transformations/pipeline.py	Wed Feb 03 17:08:27 2010 -0500
@@ -1,57 +1,208 @@
+#!/usr/bin/python
+# coding: utf-8
+
 from __future__ import with_statement
-import sys, os
+import sys, os, getopt
 import numpy
 import filetensor as ft
 import random
 
-BATCH_SIZE = 100
+# This is intended to be run as a GIMP script
+from gimpfu import *
 
-#import and stuff them in mods below
+DEBUG = True
+BATCH_SIZE = 100
+DEFAULT_NIST_PATH = '/data/lisa/data/nist/by_class/all/all_train_data.ft'
+ARGS_FILE = os.environ['PIPELINE_ARGS_TMPFILE']
+
+if DEBUG:
+    import pylab
+    pylab.ion()
 
-mods = []
+#from add_background_image import AddBackground
+#from affine_transform import AffineTransformation
+#from PoivreSel import PoivreSel
+from thick import Thick
+#from BruitGauss import BruitGauss
+#from gimp_script import GIMPTransformation
+#from Rature import Rature
+#from contrast import Contrast
+from local_elastic_distortions import LocalElasticDistorter
+from slant import Slant
+
+MODULE_INSTANCES = [Thick(), LocalElasticDistorter(), Slant()]
 
-# DANGER: HIGH VOLTAGE -- DO NOT EDIT BELOW THIS LINE
-# -----------------------------------------------------------
+class Pipeline():
+    def __init__(self, modules, num_batches, batch_size, image_size=(32,32)):
+        self.modules = modules
+        self.num_batches = num_batches
+        self.batch_size = batch_size
+        self.num_params_stored = 0
+        self.image_size = image_size
+
+        self.init_memory()
+
+    def init_num_params_stored(self):
+        # just a dummy call to regenerate_parameters() to get the
+        # real number of params (only those which are stored)
+        self.num_params_stored = 0
+        for m in self.modules:
+            self.num_params_stored += len(m.regenerate_parameters(0.0))
+
+    def init_memory(self):
+        self.init_num_params_stored()
 
-outf = sys.argv[1]
-paramsf = sys.argv[2]
-dataf = '/data/lisa/data/nist/by_class/all/all_train_data.ft'
-if len(sys.argv) >= 4:
-    dataf = sys.argv[3]
+        total = (self.num_batches + 1) * self.batch_size
+        num_px = self.image_size[0] * self.image_size[1]
+
+        self.res_data = numpy.empty((total, num_px))
+        self.params = numpy.empty((total, self.num_params_stored))
+
+    def run(self, batch_iterator, complexity_iterator):
+        img_size = self.image_size
+
+        for batch_no, batch in enumerate(batch_iterator):
+            complexity = complexity_iterator.next()
+
+            assert len(batch) == self.batch_size
+
+            for img_no, img in enumerate(batch):
+                sys.stdout.flush()
+                global_idx = batch_no*self.batch_size + img_no
+
+                img = img.reshape(img_size)
 
-train_data = open(dataf, 'rb')
+                param_idx = 0
+                for mod in self.modules:
+                    # This used to be done _per batch_,
+                    # i.e. out of the "for img" loop
+                    p = mod.regenerate_parameters(complexity)
+                    self.params[global_idx, param_idx:param_idx+len(p)] = p
+                    param_idx += len(p)
 
-dim = tuple(ft._read_header(train_data)[3])
+                    img = mod.transform_image(img)
+
+                self.res_data[global_idx] = \
+                    img.reshape((img_size[0] * img_size[1],))*255
 
-res_data = numpy.empty(dim, dtype=numpy.int8)
+                if DEBUG:
+                    # pylab is only imported when DEBUG is set
+                    pylab.imshow(img)
+                    pylab.draw()
+
+    def write_output(self, output_file_path, params_output_file_path):
+        with open(output_file_path, 'wb') as f:
+            ft.write(f, self.res_data)
 
-all_settings = ['complexity']
+        numpy.save(params_output_file_path, self.params)
+
+
+##############################################################################
+# COMPLEXITY ITERATORS
+# They are called once per batch, to get the complexity to use for that
+# batch. They must be infinite (they should never raise StopIteration
+# when next() is called).
 
-for mod in mods:
-    all_settings += mod.get_settings_names()
+# with probability probability_zero, yield complexity 0.0;
+# otherwise yield a complexity uniform over 0.0-max_complexity
+def range_complexity_iterator(probability_zero, max_complexity):
+    assert max_complexity <= 1.0
+    while True:
+        # resample on every iteration; sampling only once before the
+        # loop would lock the generator into a single branch forever
+        n = numpy.random.uniform(0.0, 1.0)
+        if n < probability_zero:
+            yield 0.0
+        else:
+            yield numpy.random.uniform(0.0, max_complexity)
+
+##############################################################################
+# DATA ITERATORS
+# They can be used to interleave different data sources etc.
+
+class NistData():
+    def __init__(self):
+        nist_path = DEFAULT_NIST_PATH
+        self.train_data = open(nist_path, 'rb')
+        self.dim = tuple(ft._read_header(self.train_data)[3])
 
-params = numpy.empty(((dim[0]/BATCH_SIZE)+1, len(all_settings)))
+def just_nist_iterator(nist, batch_size, stop_after=None):
+    for i in xrange(0, nist.dim[0], batch_size):
+        nist.train_data.seek(0)
+        yield ft.read(nist.train_data, slice(i, i+batch_size)).astype(numpy.float32)/255
+
+        if stop_after is not None and i >= stop_after:
+            break
+
+##############################################################################
+# MAIN
+
+def usage():
+    print '''
+Usage: run_pipeline.sh [-m ...] [-z ...] [-o ...] [-p ...] [-s ...]
+    -m, --max-complexity: max complexity to generate for a batch
+    -z, --probability-zero: probability of using complexity=0 for a batch
+    -o, --output-file: full path to file to use for output of images
+    -p, --params-output-file: path to file to output params to
+    -s, --stop-after: stop after having processed this many images
+    '''
+
+# See run_pipeline.sh
+def get_argv():
+    with open(ARGS_FILE) as f:
+        args = [l.rstrip() for l in f.readlines()]
+    return args
 
-for i in xrange(0, dim[0], BATCH_SIZE):
-    train_data.seek(0)
-    imgs = ft.read(train_data, slice(i, i+BATCH_SIZE)).astype(numpy.float32)/255
-
-    complexity = random.random()
-    p = i/BATCH_SIZE
-    j = 1
-    for mod in mods:
-        par = mod.regenerate_parameters(complexity)
-        params[p, j:j+len(par)] = par
-        j += len(par)
-
-    for k in range(imgs.shape[0]):
-        c = imgs[k].reshape((32, 32))
-        for mod in mods:
-            c = mod.transform_image(c)
-        res_data[i+k] = c.reshape((1024,))*255
+# Might be called locally or through dbidispatch. In all cases it should be
+# passed to the GIMP executable to be able to use GIMP filters.
+# Ex:
+def main():
+    max_complexity = 0.5 # default
+    probability_zero = 0.1 # default
+    output_file_path = None
+    params_output_file_path = None
+    stop_after = None
 
-with open(outf, 'wb') as f:
-    ft.write(f, res_data)
+    try:
+        opts, args = getopt.getopt(get_argv(), "m:z:o:p:s:", ["max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "stop-after="])
+    except getopt.GetoptError, err:
+        # print help information and exit
+        print str(err) # will print something like "option -a not recognized"
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o in ('-m', '--max-complexity'):
+            max_complexity = float(a)
+            assert 0.0 <= max_complexity <= 1.0
+        elif o in ("-z", "--probability-zero"):
+            probability_zero = float(a)
+            assert 0.0 <= probability_zero <= 1.0
+        elif o in ("-o", "--output-file"):
+            output_file_path = a
+        elif o in ('-p', "--params-output-file"):
+            params_output_file_path = a
+        elif o in ('-s', "--stop-after"):
+            stop_after = int(a)
+        else:
+            assert False, "unhandled option"
 
-numpy.save(paramsf, params)
+    if output_file_path is None or params_output_file_path is None:
+        print "Must specify both output files."
+        print
+        usage()
+        sys.exit(2)
+
+    nist = NistData()
+    num_batches = nist.dim[0]/BATCH_SIZE
+    if stop_after:
+        num_batches = stop_after
+    pl = Pipeline(modules=MODULE_INSTANCES, num_batches=num_batches, batch_size=BATCH_SIZE, image_size=(32,32))
+    cpx_it = range_complexity_iterator(probability_zero, max_complexity)
+    batch_it = just_nist_iterator(nist, BATCH_SIZE, stop_after)
+
+    pl.run(batch_it, cpx_it)
+    pl.write_output(output_file_path, params_output_file_path)
+
+main()
+
+pdb.gimp_quit(0)
+
+if DEBUG:
+    # pylab is only imported when DEBUG is set
+    pylab.ioff()
+    pylab.show()
+
diff -r 0f1337994716 -r fdb0e0870fb4 transformations/run_pipeline.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transformations/run_pipeline.sh	Wed Feb 03 17:08:27 2010 -0500
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# This is one _ugly_ hack, but I couldn't figure out how
+# to cleanly pass command line options to the script if
+# invoking using the "gimp --batch < script.py" syntax
+
+# Basically I create a temp file, put the args into it,
+# then the script gets the filename and reads back the
+# args
+
+export PIPELINE_ARGS_TMPFILE=`mktemp`
+
+for arg in "$@"
+do
+	echo "$arg" >> "$PIPELINE_ARGS_TMPFILE"
+done
+
+gimp -i --batch-interpreter python-fu-eval --batch - < pipeline.py
+
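
For reference, a hypothetical invocation of the wrapper (the option values
and output paths below are made up; the flags correspond to the getopt spec
in main() above, and dbidispatch would pass a line like this per job):

    ./run_pipeline.sh -m 0.7 -z 0.1 -o /tmp/images_out.ft -p /tmp/params_out.npy -s 1000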