annotate data_generation/amt/amt_generate.py @ 612:21d53fd07f6e

reviews AISTATS
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 20 Dec 2010 11:54:35 -0500
parents 8973abe35a9d
children
rev   line source
394
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
1 import numpy, Image
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
2 from pylearn.io import filetensor as ft
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
3
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
4
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
# Root directory that every dataset file path below is built from.
DATAPATH = '/data/lisa/data/'

# Key into DATASET_PATH selecting the dataset to sample: nist, p07, pnist
DATASET = 'nist'

# Number of batches to generate, and number of images per batch.
NUM_BATCHES = 250
BATCH_SIZE = 10

# (rows, columns) each flattened image is reshaped to before rendering.
IMGSHP = (32, 32)

# Width, in pixel columns, of the white separator placed after each image.
WHITE_SPACE_THICKNESS = 1

# For each dataset name: [path of the data file, path of the labels file].
DATASET_PATH = {
    'nist': [
        DATAPATH + 'nist/by_class/all/all_test_data.ft',
        DATAPATH + 'nist/by_class/all/all_test_labels.ft',
    ],
    'p07': [
        DATAPATH + 'ift6266h10/data/P07_test_data.ft',
        DATAPATH + 'ift6266h10/data/P07_test_labels.ft',
    ],
    'pnist': [
        DATAPATH + 'ift6266h10/data/PNIST07_test_data.ft',
        DATAPATH + 'ift6266h10/data/PNIST07_test_labels.ft',
    ],
}
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
17
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
def generate_batches():
    """Sample NUM_BATCHES random batches from DATASET and write them out.

    For each batch, BATCH_SIZE examples are drawn at random (with
    replacement) from the test set named by DATASET.  Two files are then
    written using relative names of the form ``<DATASET>_<NNNN>``:

    * ``<name>.jpeg`` -- the sampled images rendered side by side
      (see generate_image);
    * ``<name>.txt``  -- the labels and the dataset indexes of the sampled
      examples (see generate_labels).

    Finally all sampled images and labels are written as a pair of
    filetensor files through generate_ft_file.

    Raises:
        ValueError: if the selected dataset contains no examples.
    """
    total = NUM_BATCHES * BATCH_SIZE
    data_path, labels_path = DATASET_PATH[DATASET]

    # filetensor files hold binary data, so read them in binary mode, and
    # close each handle even if reading fails.
    data_file = open(data_path, 'rb')
    try:
        test_data = ft.read(data_file)
    finally:
        data_file.close()

    labels_file = open(labels_path, 'rb')
    try:
        test_labels = ft.read(labels_file)
    finally:
        labels_file.close()

    resulting_data = numpy.zeros((total, IMGSHP[0] * IMGSHP[1]))
    resulting_labels = numpy.zeros((total,))

    ds_size = len(test_data)
    if ds_size == 0:
        raise ValueError('dataset %r contains no examples' % (DATASET,))

    # Create a matrix of random integers within the range
    # [0, length_dataset - 1].  randint excludes its upper bound, so every
    # valid index -- including 0 -- can be drawn.
    rand_seq = numpy.random.randint(0, ds_size, size=(NUM_BATCHES, BATCH_SIZE))

    for i in range(NUM_BATCHES):
        start = i * BATCH_SIZE
        stop = start + BATCH_SIZE

        for j in range(BATCH_SIZE):
            source_index = rand_seq[i, j]
            resulting_data[start + j] = test_data[source_index]
            resulting_labels[start + j] = test_labels[source_index]

        image = generate_image(resulting_data[start:stop])
        text = generate_labels(resulting_labels[start:stop], rand_seq[i])

        # Batches are numbered from 1 and zero-padded to four digits.
        filename = '%s_%04d' % (DATASET, i + 1)
        image.save(filename + '.jpeg')
        save_text(text, filename)

    ft_name = 'AMT_' + DATASET + '_' + str(NUM_BATCHES)
    generate_ft_file(resulting_data, resulting_labels, ft_name)
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
49
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
def save_text(text, filename):
    """Write `text` to the file ``filename + '.txt'``.

    The file is opened in write mode, so an existing file of that name is
    overwritten.  The handle is closed even if the write fails.

    Args:
        text: string to write.
        filename: output path without the ``.txt`` extension.
    """
    f = open(filename + '.txt', 'w')
    try:
        f.write(text)
    finally:
        f.close()
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
def generate_ft_file(data, labels, ft_name):
    """Write `data` and `labels` as a pair of filetensor files.

    Args:
        data: array written to ``ft_name + '_data.ft'``.
        labels: array written to ``ft_name + '_labels.ft'``.
        ft_name: common path prefix of the two output files.
    """
    # filetensor is a binary format: open in binary mode so the bytes are
    # written untouched, and close each handle even if the write fails.
    fdata = open(ft_name + '_data.ft', 'wb')
    try:
        ft.write(fdata, data)
    finally:
        fdata.close()

    flabels = open(ft_name + '_labels.ft', 'wb')
    try:
        ft.write(flabels, labels)
    finally:
        flabels.close()
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
60
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
def generate_image(seq):
    """Render a sequence of flattened images as one horizontal strip.

    Each element of `seq` is reshaped to IMGSHP and cast to uint8.  The
    images are laid out left to right, each one followed by a white
    (value 255) separator that is WHITE_SPACE_THICKNESS columns wide.

    Args:
        seq: sequence of arrays that can each be reshaped to IMGSHP.

    Returns:
        The image built by Image.fromarray from the assembled strip.
    """
    separator = numpy.asarray(
        numpy.zeros((IMGSHP[0], WHITE_SPACE_THICKNESS)) + 255.,
        dtype='uint8')

    # Interleave every tile with its trailing separator, then join them
    # all along the column axis in a single pass.
    pieces = []
    for flat_image in seq:
        pieces.append(numpy.asarray(flat_image.reshape(IMGSHP), dtype='uint8'))
        pieces.append(separator)

    strip = numpy.hstack(pieces)
    return Image.fromarray(strip)
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
71
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
def generate_labels(seq, indexes):
    """Return the text describing one batch.

    The result has two lines: ``str(seq)`` followed by ``str(indexes)``,
    separated by a single newline with no trailing newline.
    """
    lines = (str(seq), str(indexes))
    return '\n'.join(lines)
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
74
ae8102aad586 Generation script for AMT data
humel
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: announce the run, then generate every batch.
    # A parenthesised single argument prints the same text whether print
    # is a statement (Python 2) or a function (Python 3).
    print('Starting data generation')
    generate_batches()