ift6266: diff deep/deep_mlp/job.py @ 626:75dbbe409578
Added code for deep mlp, experiment code to go along with it. Also added code I used to filter the P07 / PNIST07 datasets to keep only digits.
author | fsavard
date | Wed, 16 Mar 2011 13:43:32 -0400
parents |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/deep_mlp/job.py	Wed Mar 16 13:43:32 2011 -0400
@@ -0,0 +1,335 @@
#!/usr/bin/env python
# coding: utf-8

'''
Launching

jobman sqlschedules postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/mlp_dumi mlp_jobman.experiment mlp_jobman.conf
'n_hidden={{500,1000,2000}}'
'n_hidden_layers={{2,3}}'
'train_on={{NIST,NISTP,P07}}'
'train_subset={{DIGITS_ONLY,ALL}}'
'learning_rate_log10={{-1.,-2.,-3.}}'

in mlp_jobman.conf:
rng_seed=1234
L1_reg=0.0
L2_reg=0.0
n_epochs=10
minibatch_size=20
'''

import os, sys, copy, operator, time
import theano
import theano.tensor as T
import numpy
from mlp import MLP
from ift6266 import datasets
from pylearn.io.seriestables import *
import tables
from jobman.tools import DD

N_INPUTS = 32*32
REDUCE_EVERY = 250

TEST_RUN = False

TEST_HP = DD({'n_hidden': 200,
              'n_hidden_layers': 2,
              'train_on': 'NIST',
              'train_subset': 'ALL',
              'learning_rate_log10': -2,
              'rng_seed': 1234,
              'L1_reg': 0.0,
              'L2_reg': 0.0,
              'n_epochs': 2,
              'minibatch_size': 20})

###########################################
# digits datasets
# nist_digits is already in NIST_PATH and in ift6266.datasets
# NOTE: for these datasets the test and valid sets are wrong
# (they don't correspond to the training set... they're just placeholders)

# FTDataSet (used below) is assumed to be exposed by ift6266.datasets.defs
# alongside NIST_PATH and DATA_PATH
from ift6266.datasets.defs import NIST_PATH, DATA_PATH, FTDataSet

TRANSFORMED_DIGITS_PATH = '/data/lisatmp/ift6266h10/data/transformed_digits'

P07_digits = FTDataSet(
        train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,
                            'data/P07_train'+str(i)+'_data.ft')
                      for i in range(0, 100)],
        train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,
                            'data/P07_train'+str(i)+'_labels.ft')
                     for i in range(0, 100)],
        test_data = [os.path.join(DATA_PATH, 'data/P07_test_data.ft')],
        test_lbl = [os.path.join(DATA_PATH, 'data/P07_test_labels.ft')],
        valid_data = [os.path.join(DATA_PATH, 'data/P07_valid_data.ft')],
        valid_lbl = [os.path.join(DATA_PATH, 'data/P07_valid_labels.ft')],
        indtype=theano.config.floatX, inscale=255., maxsize=None)

# Added PNIST
PNIST07_digits = FTDataSet(
        train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,
                            'PNIST07_train'+str(i)+'_data.ft')
                      for i in range(0, 100)],
        train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,
                            'PNIST07_train'+str(i)+'_labels.ft')
                     for i in range(0, 100)],
        test_data = [os.path.join(DATA_PATH, 'data/PNIST07_test_data.ft')],
        test_lbl = [os.path.join(DATA_PATH, 'data/PNIST07_test_labels.ft')],
        valid_data = [os.path.join(DATA_PATH, 'data/PNIST07_valid_data.ft')],
        valid_lbl = [os.path.join(DATA_PATH, 'data/PNIST07_valid_labels.ft')],
        indtype=theano.config.floatX, inscale=255., maxsize=None)

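# The two FTDataSet objects above are assumed to expose the same minibatch
# iterator interface as the ift6266.datasets objects used in the rest of this
# file: dataset.train(batch_size), dataset.valid(batch_size) and
# dataset.test(batch_size) each yield (inputs, labels) pairs, e.g.
#
#   for mb_x, mb_y in P07_digits.train(20):
#       # mb_x: a (20, 32*32) matrix of theano.config.floatX pixel values,
#       # mb_y: the 20 corresponding integer class labels
#       pass
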
# building valid_test_datasets
# - we want dataset_obj's for the 3 datasets
# - so we just build FTDataset(train=whatever, test, valid=pNIST etc.)
# - in the array we want pointers to the test/valid functions themselves,
#   so NOT dataset_obj, but dataset_obj.train (without the parentheses)
def build_test_valid_sets():
    nist_ds = datasets.nist_all()
    pnist_ds = datasets.PNIST07()
    p07_ds = datasets.nist_P07()

    test_valid_fns = [nist_ds.test, nist_ds.valid,
                      pnist_ds.test, pnist_ds.valid,
                      p07_ds.test, p07_ds.valid]

    test_valid_names = ["nist_all__test", "nist_all__valid",
                        "NISTP__test", "NISTP__valid",
                        "P07__test", "P07__valid"]

    return test_valid_fns, test_valid_names

def add_error_series(series, error_name, hdf5_file,
                     index_names=('minibatch_idx',), use_accumulator=False,
                     reduce_every=250):
    # train
    series_base = ErrorSeries(error_name=error_name,
                              table_name=error_name,
                              hdf5_file=hdf5_file,
                              index_names=index_names)

    if use_accumulator:
        series[error_name] = \
            AccumulatorSeriesWrapper(base_series=series_base,
                                     reduce_every=reduce_every)
    else:
        series[error_name] = series_base

TEST_VALID_FNS, TEST_VALID_NAMES = None, None
def compute_and_save_errors(state, mlp, series, hdf5_file, minibatch_idx):
    global TEST_VALID_FNS, TEST_VALID_NAMES

    # build the test/valid iterators only once and cache them in the globals
    if TEST_VALID_FNS is None:
        TEST_VALID_FNS, TEST_VALID_NAMES = build_test_valid_sets()

    # if the training is on digits only, then there'll be a 100% error on the
    # letters in the valid/test sets... just ignore those columns

    test_fn = theano.function([mlp.input], mlp.logRegressionLayer.y_pred)

    test_batch_size = 100
    for test_ds_fn, test_ds_name in zip(TEST_VALID_FNS, TEST_VALID_NAMES):
        # reset error counts for every test/valid set
        # note: float
        total_errors = total_digit_errors = \
            total_uppercase_errors = total_lowercase_errors = 0.

        total_all = total_lowercase = total_uppercase = total_digit = 0

        for mb_x, mb_y in test_ds_fn(test_batch_size):
            digit_mask = mb_y < 10
            uppercase_mask = mb_y >= 36
            lowercase_mask = numpy.ones((len(mb_x),)) \
                                - digit_mask - uppercase_mask

            total_all += len(mb_x)
            total_digit += sum(digit_mask)
            total_uppercase += sum(uppercase_mask)
            total_lowercase += sum(lowercase_mask)

            predictions = test_fn(mb_x)

            all_errors = (mb_y != predictions)
            total_errors += sum(all_errors)

            if len(all_errors) != len(digit_mask):
                print "size all", all_errors.shape, " digit", digit_mask.shape
            total_digit_errors += sum(numpy.multiply(all_errors, digit_mask))
            total_uppercase_errors += sum(numpy.multiply(all_errors, uppercase_mask))
            total_lowercase_errors += sum(numpy.multiply(all_errors, lowercase_mask))

        four_errors = [float(total_errors) / total_all,
                       float(total_digit_errors) / total_digit,
                       float(total_lowercase_errors) / total_lowercase,
                       float(total_uppercase_errors) / total_uppercase]

        four_errors_names = ["all", "digits", "lower", "upper"]

        # record stats per set
        print "Errors on", test_ds_name, ",".join(four_errors_names), \
                ":", ",".join([str(e) for e in four_errors])

        # now in the state
        for err, errname in zip(four_errors, four_errors_names):
            error_full_name = 'error__'+test_ds_name+'_'+errname
            min_name = 'min_'+error_full_name
            minpos_name = 'minpos_'+error_full_name

            if state.has_key(min_name):
                if state[min_name] > err:
                    state[min_name] = err
                    state[minpos_name] = minibatch_idx
            else:
                # also create the series
                add_error_series(series, error_full_name, hdf5_file,
                                 index_names=('minibatch_idx',))
                state[min_name] = err
                state[minpos_name] = minibatch_idx

            series[error_full_name].append((minibatch_idx,), err)

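# Naming scheme produced by compute_and_save_errors, worked out for one case:
# for the dataset name "P07__valid" and the error column "digits", the series
# and state entry is 'error__P07__valid_digits', the running minimum is kept
# in 'min_error__P07__valid_digits', and the minibatch index at which that
# minimum was reached is kept in 'minpos_error__P07__valid_digits'.
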
def jobman_entrypoint(state, channel):
    global TEST_RUN
    minibatch_size = state.minibatch_size

    print_every = 100000
    COMPUTE_ERROR_EVERY = 10**7 / minibatch_size # compute error every 10 million examples
    if TEST_RUN:
        print_every = 100
        COMPUTE_ERROR_EVERY = 1000 / minibatch_size

    print "entrypoint, state is"
    print state

    ######################
    # select dataset and dataset subset, plus adjust epoch num to make the
    # number of examples seen independent of the dataset
    # example: for the DIGITS_ONLY case the number of epochs must be changed,
    # and for the pure NIST case (no transformations) it must be multiplied
    # by 100 from the start, since we don't have the variations

    # compute this in terms of the P07 dataset size (=80M)
    MINIBATCHES_TO_SEE = state.n_epochs * 8 * (10**6) / minibatch_size

    # P07_digits and PNIST07_digits are the datasets defined at the top of
    # this module
    if state.train_on == 'NIST' and state.train_subset == 'ALL':
        dataset_obj = datasets.nist_all()
    elif state.train_on == 'NIST' and state.train_subset == 'DIGITS_ONLY':
        dataset_obj = datasets.nist_digits()
    elif state.train_on == 'NISTP' and state.train_subset == 'ALL':
        dataset_obj = datasets.PNIST07()
    elif state.train_on == 'NISTP' and state.train_subset == 'DIGITS_ONLY':
        dataset_obj = PNIST07_digits
    elif state.train_on == 'P07' and state.train_subset == 'ALL':
        dataset_obj = datasets.nist_P07()
    elif state.train_on == 'P07' and state.train_subset == 'DIGITS_ONLY':
        dataset_obj = P07_digits

    dataset = dataset_obj

    if state.train_subset == 'ALL':
        n_classes = 62
    elif state.train_subset == 'DIGITS_ONLY':
        n_classes = 10
    else:
        raise NotImplementedError()

    ###############################
    # construct model

    print "constructing model..."
    x = T.matrix('x')
    y = T.ivector('y')

    rng = numpy.random.RandomState(state.rng_seed)

    # construct the MLP class
    model = MLP(rng=rng, input=x, n_in=N_INPUTS,
                n_hidden_layers=state.n_hidden_layers,
                n_hidden=state.n_hidden, n_out=n_classes)

    # cost and training fn
    cost = T.mean(model.negative_log_likelihood(y)) \
                + state.L1_reg * model.L1 \
                + state.L2_reg * model.L2_sqr

    print "L1, L2: ", state.L1_reg, state.L2_reg

    gradient_nll_wrt_params = []
    for param in model.params:
        gparam = T.grad(cost, param)
        gradient_nll_wrt_params.append(gparam)

    learning_rate = 10**float(state.learning_rate_log10)
    print "Learning rate", learning_rate

    train_updates = {}
    for param, gparam in zip(model.params, gradient_nll_wrt_params):
        train_updates[param] = param - learning_rate * gparam

    train_fn = theano.function([x, y], cost, updates=train_updates)

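    # Worked numbers under the defaults from mlp_jobman.conf (see the module
    # docstring): learning_rate_log10=-2 gives learning_rate = 10**-2 = 0.01,
    # and n_epochs=10 with minibatch_size=20 gives
    # MINIBATCHES_TO_SEE = 10 * 8e6 / 20 = 4,000,000 minibatches, i.e. 80M
    # training examples seen, the P07-sized budget mentioned above.
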
+ print "will train for", MINIBATCHES_TO_SEE, "examples" + + mb_idx = 0 + + while(mb_idx*minibatch_size<nb_max_exemples): + + last_costs = [] + + for mb_x, mb_y in dataset.train(minibatch_size): + if TEST_RUN and mb_idx > 1000: + break + + last_cost = train_fn(mb_x, mb_y) + series["training_error"].append((mb_idx,), last_cost) + + last_costs.append(last_cost) + if (len(last_costs)+1) % print_every == 0: + print "Mean over last", print_every, "minibatches: ", numpy.mean(last_costs) + last_costs = [] + + if (mb_idx+1) % COMPUTE_ERROR_EVERY == 0: + # compute errors + print "computing errors on all datasets..." + print "Time since training began: ", (time.clock()-start_time)/60., "minutes" + compute_and_save_errors(state, model, series, h5f, mb_idx) + + channel.save() + + sys.stdout.flush() + + end_time = time.clock() + + print "-"*80 + print "Finished. Training took", (end_time-start_time)/60., "minutes" + print state + +def run_test(): + global TEST_RUN + from fsml.job_management import mock_channel + TEST_RUN = True + jobman_entrypoint(TEST_HP, mock_channel) + +if __name__ == '__main__': + run_test() +