394
|
1 import numpy, Image
|
|
2 from pylearn.io import filetensor as ft
|
|
3
|
|
4
|
|
# Root directory that every dataset file path below is built from.
DATAPATH = '/data/lisa/data/'

# Key into DATASET_PATH selecting which test set to sample: nist, p07, pnist
DATASET = 'nist'

# Number of batches to generate and number of examples per batch.
NUM_BATCHES = 250
BATCH_SIZE = 10

# Shape each flat example is reshaped to before being drawn.
IMGSHP = (32, 32)

# Width, in pixels, of the white column appended after each image.
WHITE_SPACE_THICKNESS = 1

# For each dataset name: [path of the data file, path of the labels file].
DATASET_PATH = {
    'nist': [
        DATAPATH + 'nist/by_class/all/all_test_data.ft',
        DATAPATH + 'nist/by_class/all/all_test_labels.ft',
    ],
    'p07': [
        DATAPATH + 'ift6266h10/data/P07_test_data.ft',
        DATAPATH + 'ift6266h10/data/P07_test_labels.ft',
    ],
    'pnist': [
        DATAPATH + 'ift6266h10/data/PNIST07_test_data.ft',
        DATAPATH + 'ift6266h10/data/PNIST07_test_labels.ft',
    ],
}
|
|
17
|
|
def generate_batches():
    """Sample random test examples and write them out as NUM_BATCHES batches.

    NUM_BATCHES * BATCH_SIZE examples are drawn uniformly, with replacement,
    from the test set selected by DATASET.  For every batch two files are
    written to the current directory:

    * ``<DATASET>_<NNNN>.jpeg`` -- the strip built by generate_image();
    * ``<DATASET>_<NNNN>.txt``  -- the text built by generate_labels()
      (the batch labels and the sampled dataset indexes).

    All sampled examples and labels are finally handed to
    generate_ft_file() under the prefix ``AMT_<DATASET>_<NUM_BATCHES>``.
    """
    total = NUM_BATCHES * BATCH_SIZE

    data_path, labels_path = DATASET_PATH[DATASET]
    # Open as raw bytes and let ``with`` close the handles even if
    # ft.read() raises.
    with open(data_path, 'rb') as f:
        test_data = ft.read(f)
    with open(labels_path, 'rb') as g:
        test_labels = ft.read(g)

    resulting_data = numpy.zeros((total, IMGSHP[0] * IMGSHP[1]))
    resulting_labels = numpy.zeros((total,))

    # Create a matrix of random integers within the range
    # [0, length_dataset - 1].  randint's upper bound is exclusive, so every
    # example, including index 0, can be drawn.  (The previous
    # random_integers(ds_size - 1) call sampled from [1, ds_size - 1] and
    # therefore never selected the first example.)
    ds_size = len(test_data)
    rand_seq = numpy.random.randint(0, ds_size,
                                    size=(NUM_BATCHES, BATCH_SIZE))

    for i in range(NUM_BATCHES):
        start = i * BATCH_SIZE
        stop = start + BATCH_SIZE

        # Copy the sampled examples of this batch into the output arrays.
        for j in range(BATCH_SIZE):
            resulting_data[start + j] = test_data[rand_seq[i, j]]
            resulting_labels[start + j] = test_labels[rand_seq[i, j]]

        image = generate_image(resulting_data[start:stop])
        text = generate_labels(resulting_labels[start:stop], rand_seq[i])

        # Batches are numbered from 1 and zero-padded to four digits.
        filename = '%s_%04d' % (DATASET, i + 1)
        image.save(filename + '.jpeg')
        save_text(text, filename)

    ft_name = 'AMT_' + DATASET + '_' + str(NUM_BATCHES)
    generate_ft_file(resulting_data, resulting_labels, ft_name)
|
|
49
|
|
def save_text(text, filename):
    """Write *text* to the file ``filename + '.txt'``.

    An existing file of that name is overwritten.  The file is closed even
    if the write fails.

    :param text: string to write.
    :param filename: output path without the ``.txt`` extension.
    """
    with open(filename + '.txt', 'w') as f:
        f.write(text)
|
|
def generate_ft_file(data, labels, ft_name):
    """Store *data* and *labels* with ft.write() in two files.

    The files are named ``ft_name + '_data.ft'`` and
    ``ft_name + '_labels.ft'``; existing files are overwritten.

    :param data: array passed to ft.write() for the data file.
    :param labels: array passed to ft.write() for the labels file.
    :param ft_name: common path prefix of both output files.
    """
    # Binary mode keeps the serialized bytes untouched on platforms that
    # translate newlines in text mode; ``with`` closes each file even if
    # ft.write() raises.
    with open(ft_name + '_data.ft', 'wb') as fdata:
        ft.write(fdata, data)
    with open(ft_name + '_labels.ft', 'wb') as flabels:
        ft.write(flabels, labels)
|
|
60
|
|
def generate_image(seq):
    """Build one image showing every example of *seq* side by side.

    Each element of *seq* is reshaped to IMGSHP, converted to uint8 and
    followed by a white (255) column WHITE_SPACE_THICKNESS pixels wide.

    :param seq: sequence of arrays, each reshapeable to IMGSHP.
    :return: the result of Image.fromarray() on the horizontally stacked
        tiles.
    """
    # White separator column placed after every image.
    separator = numpy.empty((IMGSHP[0], WHITE_SPACE_THICKNESS), dtype='uint8')
    separator.fill(255)

    # Interleave image, separator, image, separator, ...
    tiles = []
    for flat in seq:
        tiles.append(numpy.asarray(flat.reshape(IMGSHP), dtype='uint8'))
        tiles.append(separator)

    strip = numpy.hstack(tiles)
    return Image.fromarray(strip)
|
|
71
|
|
def generate_labels(seq, indexes):
    """Return the text stored next to a batch image.

    The result is ``str(seq)``, a newline, then ``str(indexes)``.
    """
    lines = (str(seq), str(indexes))
    return '\n'.join(lines)
|
|
74
|
|
# Script entry point: announce the run, then generate every batch.
if __name__ == '__main__':
    print('Starting data generation')
    generate_batches()
|