# HG changeset patch # User James Bergstra # Date 1301958053 14400 # Node ID 283fb236f10484eb68a277d936b6e3f2e54ca3dd # Parent d9dd09a2ee9092564a4ec495a7ff89fc8fca541e datasets/tinyimages can generate a shuffled file diff -r d9dd09a2ee90 -r 283fb236f104 pylearn/datasets/tinyimages.py --- a/pylearn/datasets/tinyimages.py Mon Apr 04 18:59:18 2011 -0400 +++ b/pylearn/datasets/tinyimages.py Mon Apr 04 19:00:53 2011 -0400 @@ -22,6 +22,8 @@ _tinyimages_root='/data/lisa/data/tinyimages' _original=_tinyimages_root+'/tinyimages/original' _npy_file=_tinyimages_root+'/tinyimages.npy' +_shuffled_npy_file=_tinyimages_root+'/tinyimages_shuffled.npy' +_shuffled_npy_seed=12345 _README_file=_tinyimages_root+'/README.txt' _README = """ TinyImages is a dataset of 32x32 RGB images. @@ -89,15 +91,14 @@ def arrange_first_N_into_tiling(R,C, filename): R=int(R) C=int(C) - A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))], - dtype='float32') - print A.shape - A.shape = (R*C, 32*32,3) - pylearn.io.image_tiling.save_tiled_raster_images( - pylearn.io.image_tiling.tile_raster_images( - (A[:,:,0], A[:,:,1], A[:,:,2], None), - (32,32)), - filename) + if 1: + A = numpy.asarray([i.copy() for i,ii in zip(image_generator(), xrange(R*C))]) + elif 0: + A = get_memmapped_file(R*C) + else: + A = get_shuffled_memmapped_file(R*C) + pylearn.io.image_tiling.tile_slices_to_image_uint8(A, + tile_shape=(R,C)).save(filename) n_images = 1608356 @@ -107,22 +108,48 @@ dtype='uint8', mode='r', shape=(N,32,32,3)) +def get_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file): + return get_memmapped_file(N, filename) -def rebuild_numpy_file(N=n_images, filename=_npy_file): +def rebuild_memmapped_file(N=n_images, filename=_npy_file): shp = (N,32,32,3) print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes' open(_README_file, 'w').write(_README) mmap = numpy.memmap(filename, dtype='uint8', - mode='w+', + mode='w+', #create over overwrite file for R/W shape=shp) ig = image_generator() for ii in xrange(N): mmap[ii] = ig.next() mmap.flush() +def rebuild_shuffled_memmapped_file(N=n_images, filename=_shuffled_npy_file, + seed=_shuffled_npy_seed, + orig_filename=_npy_file): + try: + orig = get_memmapped_file(N, orig_filename) + except IOError: + print >> sys.stderr, "pylearn.datasets.tinyimages: rebuild un-shuffled file first" + raise + shp = orig.shape + print >> sys.stderr, "pylearn.datasets.tinyimages rebuilding", filename, shp, N*32*32*3 / float(1024**3), 'gigabytes' + mmap = numpy.memmap(filename, + dtype='uint8', + mode='w+',#create over overwrite file for R/W + shape=shp) + idxlist = numpy.arange(orig.shape[0]) + numpy.random.RandomState(seed).shuffle(idxlist) + assert idxlist[0] != 0 + for i0, i1 in enumerate(idxlist): + mmap[i0] = orig[i1] + if not i0 % 10000: + print>> sys.stderr, "%i/%i"%(i0, len(idxlist)) + mmap.flush() + def main(argv=[]): if argv: + print "Saving images to ", argv[2] arrange_first_N_into_tiling( argv[0], argv[1], argv[2]) else: def iter_len(x):