comparison pylearn/datasets/utlc.py @ 1428:3823dbfff6cf

add parameter to randomize the valid and test data.
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 08 Feb 2011 12:57:15 -0500
parents a36d3a406c59
children b0141efbf6a2
comparison
equal deleted inserted replaced
1427:a36d3a406c59 1428:3823dbfff6cf
14 import theano 14 import theano
15 15
16 import pylearn.io.filetensor as ft 16 import pylearn.io.filetensor as ft
17 import config 17 import config
18 18
19 def load_ndarray_dataset(name, normalize=True, transfer=False, normalize_on_the_fly=False): 19 def load_ndarray_dataset(name, normalize=True, transfer=False,
20 normalize_on_the_fly=False, randomize_valid=False,
21 randomize_test=False):
20 """ Load the train,valid,test data for the dataset `name` 22 """ Load the train,valid,test data for the dataset `name`
21 and return it in ndarray format. 23 and return it in ndarray format.
22 24
23 :param normalize: If True, we normalize the train dataset 25 :param normalize: If True, we normalize the train dataset
24 before returning it 26 before returning it
25 :param transfer: If True also return the transfer label 27 :param transfer: If True also return the transfer label (currently only available for ule)
26 :param normalize_on_the_fly: If True, we return a Theano Variable that will give 28 :param normalize_on_the_fly: If True, we return a Theano Variable that will give
27 as output the normalized value. If the user only 29 as output the normalized value. If the user only
28 takes a subtensor of that variable, Theano optimization 30 takes a subtensor of that variable, Theano optimization
29 should ensure that we will only have in memory the subtensor 31 should ensure that we will only have in memory the subtensor
30 portion that is computed in normalized form. We store 32 portion that is computed in normalized form. We store
31 the original data in shared memory in its original dtype. 33 the original data in shared memory in its original dtype.
32 34
33 This is useful to have the original data in its original 35 This is useful to have the original data in its original
34 dtype in memory to save memory. Especially useful to 36 dtype in memory to save memory. Especially useful to
35 be able to use rita and harry with 1G per job. 37 be able to use rita and harry with 1G per job.
38 :param randomize_valid: Do we randomize the order of the valid set?
39 We always use the same random order.
40 If False, return in the same order as downloaded on the web.
41 :param randomize_test: Do we randomize the order of the test set?
42 We always use the same random order.
43 If False, return in the same order as downloaded on the web.
36 """ 44 """
37 assert not (normalize and normalize_on_the_fly), "Can't normalize in 2 way at the same time!" 45 assert not (normalize and normalize_on_the_fly), "Can't normalize in 2 way at the same time!"
38 46
39 assert name in ['avicenna','harry','rita','sylvester','ule'] 47 assert name in ['avicenna','harry','rita','sylvester','ule']
40 common = os.path.join('UTLC','filetensor',name+'_') 48 common = os.path.join('UTLC','filetensor',name+'_')
43 for subset in ['train','valid','test']] 51 for subset in ['train','valid','test']]
44 52
45 train = load_filetensor(trname) 53 train = load_filetensor(trname)
46 valid = load_filetensor(vname) 54 valid = load_filetensor(vname)
47 test = load_filetensor(tename) 55 test = load_filetensor(tename)
56 if randomize_valid:
57 rng = numpy.random.RandomState([1,2,3,4])
58 perm = rng.permutation(valid.shape[0])
59 valid = valid[perm]
60 if randomize_test:
61 rng = numpy.random.RandomState([1,2,3,4])
62 perm = rng.permutation(test.shape[0])
63 test = test[perm]
48 64
49 if normalize or normalize_on_the_fly: 65 if normalize or normalize_on_the_fly:
50 if normalize_on_the_fly: 66 if normalize_on_the_fly:
51 train = theano.shared(train, borrow=True, name=name+"_train") 67 train = theano.shared(train, borrow=True, name=name+"_train")
52 valid = theano.shared(valid, borrow=True, name=name+"_valid") 68 valid = theano.shared(valid, borrow=True, name=name+"_valid")