comparison data_generation/pipeline/pipeline.py @ 261:6d16a2bf142b

Important bug fix in the pipeline: an iterator was always giving 0 complexity. 10% of the P07 dataset needs to be redone.
author Xavier Glorot <glorotxa@iro.umontreal.ca>
date Wed, 17 Mar 2010 16:41:16 -0400
parents dd2df78fcf47
children
comparison
equal deleted inserted replaced
256:bd7e50d56d80 261:6d16a2bf142b
62 -y, --seed: the job seed 62 -y, --seed: the job seed
63 -t, --type: [default: 0:full transformations], 1:Nist-friendly transformations 63 -t, --type: [default: 0:full transformations], 1:Nist-friendly transformations
64 ''' 64 '''
65 65
66 try: 66 try:
67 opts, args = getopt.getopt(get_argv(), "rm:z:o:p:x:s:f:l:c:d:a:b:g:y:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", 67 opts, args = getopt.getopt(get_argv(), "r:m:z:o:p:x:s:f:l:c:d:a:b:g:y:t:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=",
68 "stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr=", "seed="]) 68 "stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr=", "seed=","type="])
69 except getopt.GetoptError, err: 69 except getopt.GetoptError, err:
70 # print help information and exit: 70 # print help information and exit:
71 print str(err) # will print something like "option -a not recognized" 71 print str(err) # will print something like "option -a not recognized"
72 usage() 72 usage()
73 pdb.gimp_quit(0) 73 pdb.gimp_quit(0)
76 for o, a in opts: 76 for o, a in opts:
77 if o in ('-y','--seed'): 77 if o in ('-y','--seed'):
78 random.seed(int(a)) 78 random.seed(int(a))
79 numpy.random.seed(int(a)) 79 numpy.random.seed(int(a))
80 80
81 type_pipeline = 0
81 for o, a in opts: 82 for o, a in opts:
82 if o in ('-t','--type'): 83 if o in ('-t','--type'):
83 type_pipeline = int(a) 84 type_pipeline = int(a)
84 else:
85 type_pipeline = 0
86 85
87 if DEBUG_X: 86 if DEBUG_X:
88 import pylab 87 import pylab
89 pylab.ion() 88 pylab.ion()
90 89
179 param_idx = 0 178 param_idx = 0
180 mod_idx = 0 179 mod_idx = 0
181 for mod in self.modules: 180 for mod in self.modules:
182 # This used to be done _per batch_, 181 # This used to be done _per batch_,
183 # ie. out of the "for img" loop 182 # ie. out of the "for img" loop
184 complexity = complexity_iterator.next() 183 complexity = complexity_iterator.next()
185 #better to do a complexity sampling for each transformations in order to have more variability 184 #better to do a complexity sampling for each transformations in order to have more variability
186 #otherwise a lot of images similar to the source are generated (i.e. when complexity is close to 0 (1/8 of the time)) 185 #otherwise a lot of images similar to the source are generated (i.e. when complexity is close to 0 (1/8 of the time))
187 #we need to save the complexity of each transformations and the sum of these complexity is a good indicator of the overall 186 #we need to save the complexity of each transformations and the sum of these complexity is a good indicator of the overall
188 #complexity 187 #complexity
189 self.params[global_idx, mod_idx] = complexity 188 self.params[global_idx, mod_idx] = complexity
213 hook.end_transform_callback(img) 212 hook.end_transform_callback(img)
214 213
215 def write_output(self, output_file_path, params_output_file_path, labels_output_file_path): 214 def write_output(self, output_file_path, params_output_file_path, labels_output_file_path):
216 with open(output_file_path, 'wb') as f: 215 with open(output_file_path, 'wb') as f:
217 ft.write(f, self.res_data) 216 ft.write(f, self.res_data)
218 217
218 #if type_pipeline == 0: #only needed for type 0 pipeline
219 numpy.save(params_output_file_path, self.params) 219 numpy.save(params_output_file_path, self.params)
220 220
221 with open(labels_output_file_path, 'wb') as f: 221 with open(labels_output_file_path, 'wb') as f:
222 ft.write(f, self.res_labels) 222 ft.write(f, self.res_labels)
223 223
224 224
225 ############################################################################## 225 ##############################################################################
230 # probability of generating 0 complexity, otherwise 230 # probability of generating 0 complexity, otherwise
231 # uniform over 0.0-max_complexity 231 # uniform over 0.0-max_complexity
232 def range_complexity_iterator(probability_zero, max_complexity): 232 def range_complexity_iterator(probability_zero, max_complexity):
233 assert max_complexity <= 1.0 233 assert max_complexity <= 1.0
234 n = numpy.random.uniform(0.0, 1.0) 234 n = numpy.random.uniform(0.0, 1.0)
235 n = 2.0 #hack to bug fix, having a min complexity is not necessary and we need the same seed...
235 while True: 236 while True:
236 if n < probability_zero: 237 if n < probability_zero:
237 yield 0.0 238 yield 0.0
238 else: 239 else:
239 yield numpy.random.uniform(0.0, max_complexity) 240 yield numpy.random.uniform(0.0, max_complexity)
371 elif o in ('-g', "--prob-ocr"): 372 elif o in ('-g', "--prob-ocr"):
372 prob_ocr = float(a) 373 prob_ocr = float(a)
373 elif o in ('-y', "--seed"): 374 elif o in ('-y', "--seed"):
374 pass 375 pass
375 elif o in ('-t', "--type"): 376 elif o in ('-t', "--type"):
376 type_pipeline = int(a) 377 pass
377 else: 378 else:
378 assert False, "unhandled option" 379 assert False, "unhandled option"
379 380
380 if output_file_path == None or params_output_file_path == None or labels_output_file_path == None: 381 if output_file_path == None or params_output_file_path == None or labels_output_file_path == None:
381 print "Must specify the three output files." 382 print "Must specify the three output files."