# HG changeset patch # User boulanni # Date 1265690717 18000 # Node ID cc4be6b25b8ea9be6340383623cd369584173604 # Parent d508f5a8acd028b47d897d1fdc7e52f72f3592f5 Data iterator alternating between NIST/font/captcha, removed the use of batches, keep track of labels (Not fully done yet) diff -r d508f5a8acd0 -r cc4be6b25b8e transformations/pipeline.py --- a/transformations/pipeline.py Mon Feb 08 14:55:22 2010 -0500 +++ b/transformations/pipeline.py Mon Feb 08 23:45:17 2010 -0500 @@ -4,14 +4,14 @@ from __future__ import with_statement # This is intended to be run as a GIMP script -from gimpfu import * +#from gimpfu import * import sys, os, getopt import numpy import filetensor as ft import random -# To debug locally, also call with -s 1 (to stop after 1 batch ~= 100) +# To debug locally, also call with -s 100 (to stop after ~100) # (otherwise we allocate all needed memory, might be loonnng and/or crash # if, lucky like me, you have an age-old laptop creaking from everywhere) DEBUG = True @@ -31,8 +31,8 @@ # (create it, otherwise it'll crash) DEBUG_OUTPUT_DIR = 'debug_out' -BATCH_SIZE = 100 -DEFAULT_NIST_PATH = '/data/lisa/data/nist/by_class/all/all_train_data.ft' +DEFAULT_NIST_PATH = '/data/lisa/data/ift6266h10/train_data.ft' +DEFAULT_LABEL_PATH = '/data/lisa/data/ift6266h10/train_labels.ft' ARGS_FILE = os.environ['PIPELINE_ARGS_TMPFILE'] if DEBUG_X: @@ -72,10 +72,9 @@ END_TRANSFORM_HOOK = [VISUALIZER] class Pipeline(): - def __init__(self, modules, num_batches, batch_size, image_size=(32,32)): + def __init__(self, modules, num_img, image_size=(32,32)): self.modules = modules - self.num_batches = num_batches - self.batch_size = batch_size + self.num_img = num_img self.num_params_stored = 0 self.image_size = image_size @@ -91,66 +90,65 @@ def init_memory(self): self.init_num_params_stored() - total = self.num_batches * self.batch_size + total = self.num_img num_px = self.image_size[0] * self.image_size[1] - self.res_data = numpy.empty((total, num_px)) + self.res_data = numpy.empty((total, num_px), dtype=numpy.uint8) # +1 to store complexity self.params = numpy.empty((total, self.num_params_stored+1)) + self.res_labels = numpy.empty(total, dtype=numpy.int32) - def run(self, batch_iterator, complexity_iterator): + def run(self, img_iterator, complexity_iterator): img_size = self.image_size should_hook_after_each = len(AFTER_EACH_MODULE_HOOK) != 0 should_hook_at_the_end = len(END_TRANSFORM_HOOK) != 0 - for batch_no, batch in enumerate(batch_iterator): + for img_no, (img, label) in enumerate(img_iterator): + sys.stdout.flush() complexity = complexity_iterator.next() - if DEBUG: - print "Complexity:", complexity - assert len(batch) == self.batch_size + global_idx = img_no - for img_no, img in enumerate(batch): - sys.stdout.flush() - global_idx = batch_no*self.batch_size + img_no - - img = img.reshape(img_size) + img = img.reshape(img_size) - param_idx = 1 - # store complexity along with other params - self.params[global_idx, 0] = complexity - for mod in self.modules: - # This used to be done _per batch_, - # ie. out of the "for img" loop - p = mod.regenerate_parameters(complexity) - self.params[global_idx, param_idx:param_idx+len(p)] = p - param_idx += len(p) + param_idx = 1 + # store complexity along with other params + self.params[global_idx, 0] = complexity + for mod in self.modules: + # This used to be done _per batch_, + # ie. out of the "for img" loop + p = mod.regenerate_parameters(complexity) + self.params[global_idx, param_idx:param_idx+len(p)] = p + param_idx += len(p) - img = mod.transform_image(img) + img = mod.transform_image(img) - if should_hook_after_each: - for hook in AFTER_EACH_MODULE_HOOK: - hook.after_transform_callback(img) - - self.res_data[global_idx] = \ - img.reshape((img_size[0] * img_size[1],))*255 + if should_hook_after_each: + for hook in AFTER_EACH_MODULE_HOOK: + hook.after_transform_callback(img) + self.res_data[global_idx] = \ + img.reshape((img_size[0] * img_size[1],))*255 + self.res_labels[global_idx] = label - if should_hook_at_the_end: - for hook in END_TRANSFORM_HOOK: - hook.end_transform_callback(img) + if should_hook_at_the_end: + for hook in END_TRANSFORM_HOOK: + hook.end_transform_callback(img) - def write_output(self, output_file_path, params_output_file_path): + def write_output(self, output_file_path, params_output_file_path, labels_output_file_path): with open(output_file_path, 'wb') as f: ft.write(f, self.res_data) numpy.save(params_output_file_path, self.params) + + with open(labels_output_file_path, 'wb') as f: + ft.write(f, self.res_labels) ############################################################################## # COMPLEXITY ITERATORS -# They're called once every batch, to get the complexity to use for that batch +# They're called once every img, to get the complexity to use for that img # they must be infinite (should never throw StopIteration when calling next()) # probability of generating 0 complexity, otherwise @@ -190,19 +188,25 @@ ''' class NistData(): - def __init__(self, ): - nist_path = DEFAULT_NIST_PATH + def __init__(self, nist_path, label_path): self.train_data = open(nist_path, 'rb') + self.train_labels = open(label_path, 'rb') self.dim = tuple(ft._read_header(self.train_data)[3]) -def just_nist_iterator(nist, batch_size, stop_after=None): - for i in xrange(0, nist.dim[0], batch_size): - if not stop_after is None and i >= stop_after: - break - - nist.train_data.seek(0) - yield ft.read(nist.train_data, slice(i, i+batch_size)).astype(numpy.float32)/255 - +def nist_supp_iterator(nist, prob_font, prob_captcha, num_img): + subtensor = slice(0, num_img) + img = ft.read(nist.train_data, subtensor).astype(numpy.float32)/255 + labels = ft.read(nist.train_labels, subtensor) + + for i in xrange(num_img): + r = numpy.random.rand() + if r<= prob_font: + pass #get font + elif r<= prob_font + prob_captcha: + pass #get captcha + else: + j = numpy.random.randint(num_img) + yield img[j], labels[j] # Mostly for debugging, for the moment, just to see if we can @@ -225,10 +229,15 @@ def usage(): print ''' Usage: run_pipeline.sh [-m ...] [-z ...] [-o ...] [-p ...] - -m, --max-complexity: max complexity to generate for a batch - -z, --probability-zero: probability of using complexity=0 for a batch + -m, --max-complexity: max complexity to generate for an image + -z, --probability-zero: probability of using complexity=0 for an image -o, --output-file: full path to file to use for output of images -p, --params-output-file: path to file to output params to + -r, --labels-output-file: path to file to output labels to + -f, --data-file: path to filetensor (.ft) data file (NIST) + -l, --label-file: path to filetensor (.ft) labels file (NIST labels) + -a, --prob-font: probability of using a raw font image + -b, --prob-captcha: probability of using a captcha image ''' # See run_pipeline.py @@ -245,11 +254,16 @@ probability_zero = 0.1 # default output_file_path = None params_output_file_path = None + labels_output_file_path = None + nist_path = DEFAULT_NIST_PATH + label_path = DEFAULT_LABEL_PATH + prob_font = 0.0 + prob_captcha = 0.0 stop_after = None reload_mode = False try: - opts, args = getopt.getopt(get_argv(), "rm:z:o:p:s:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "stop-after="]) + opts, args = getopt.getopt(get_argv(), "rm:z:o:p:r:s:f:l:a:b:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", "stop-after=", "data-file=", "label-file=", "prob-font=", "prob-captcha="]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" @@ -269,13 +283,23 @@ output_file_path = a elif o in ('-p', "--params-output-file"): params_output_file_path = a + elif o in ('-r', "--labels-output-file"): + labels_output_file_path = a elif o in ('-s', "--stop-after"): stop_after = int(a) + elif o in ('-f', "--data-file"): + nist_path = a + elif o in ('-l', "--label-file"): + label_path = a + elif o in ('-a', "--prob-font"): + prob_font = float(a) + elif o in ('-b', "--prob-captcha"): + prob_captcha = float(a) else: assert False, "unhandled option" - if output_file_path == None or params_output_file_path == None: - print "Must specify both output files." + if output_file_path == None or params_output_file_path == None or labels_output_file_path == None: + print "Must specify the three output files." print usage() sys.exit(2) @@ -287,22 +311,21 @@ ''' # This code is yet untested debug_images = DebugImages(DEBUG_IMAGES_PATH) - num_batches = 1 - batch_size = len(debug_images.filelist) - pl = Pipeline(modules=MODULE_INSTANCES, num_batches=num_batches, batch_size=BATCH_SIZE, image_size=(32,32)) - batch_it = debug_images_iterator(debug_images) + num_img = len(debug_images.filelist) + pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) + img_it = debug_images_iterator(debug_images) ''' else: - nist = NistData() - num_batches = nist.dim[0]/BATCH_SIZE + nist = NistData(nist_path, label_path) + num_img = nist.dim[0] if stop_after: - num_batches = stop_after - pl = Pipeline(modules=MODULE_INSTANCES, num_batches=num_batches, batch_size=BATCH_SIZE, image_size=(32,32)) - batch_it = just_nist_iterator(nist, BATCH_SIZE, stop_after) + num_img = stop_after + pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) + img_it = nist_supp_iterator(nist, prob_font, prob_captcha, num_img) cpx_it = range_complexity_iterator(probability_zero, max_complexity) - pl.run(batch_it, cpx_it) - pl.write_output(output_file_path, params_output_file_path) + pl.run(img_it, cpx_it) + pl.write_output(output_file_path, params_output_file_path, labels_output_file_path) _main()