pylearn changeset 837:28ceb345ab78 (merge)

author:   James Bergstra <bergstrj@iro.umontreal.ca>
date:     Fri, 16 Oct 2009 12:20:57 -0400
parents:  788c2c8558eb (diff), 0ba62c55d59f (current diff)
children: 4f7e0edee7d0
diffstat: 35 files changed, 2436 insertions(+), 2772 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/COIL100.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,62 @@
+
+"""
+http://www1.cs.columbia.edu/CAVE/software/softlib/coil-100.php
+
+"Columbia Object Image Library (COIL-100),"
+  S. A. Nene, S. K. Nayar and H. Murase,
+  Technical Report CUCS-006-96, February 1996.
+
+"""
+
+import os, cPickle
+import Image, numpy
+from pylearn.datasets.config import data_root # config
+
+from .memo import memo
+
+def filenames():
+    root = os.path.join(data_root(), 'COIL-100', 'coil-100')
+    for filename in os.listdir(root):
+        yield filename, os.path.join(root, filename)
+
+def filenameidx_imgidx(filename):
+    if filename.startswith("obj"):
+        obj_idx = int(filename[3:filename.index("_")])
+        img_idx = int(filename[filename.index("_")+2:filename.index(".")])
+        return obj_idx, img_idx
+    else:
+        raise ValueError(filename)
+
+_32x32grey_path = os.path.join(data_root(), "COIL-100", "dct_32x32_grey.pkl")
+_32x32grey_header = "Dictionary of COIL-100 dataset at 32x32 resolution, greyscale"
+def build_32x32_grey():
+    f = file(_32x32grey_path, "w")
+    cPickle.dump(_32x32grey_header, f, protocol=cPickle.HIGHEST_PROTOCOL)
+
+    dct = {}
+    for filename, fullname in filenames():
+        if filename.startswith('obj'):
+            obj_idx, img_idx = filenameidx_imgidx(filename)
+            img = numpy.asarray(Image.open(fullname))
+            dct.setdefault(obj_idx, {})[img_idx] = img.mean(axis=2)[::4, ::4]
+    rval = numpy.empty((100, 72, 32, 32), dtype='float32')
+    rval[...] = -1
+    for obj_id, dd in dct.iteritems():
+        for img_id, v in dd.iteritems():
+            rval[obj_id, img_id, :, :] = v
+    assert numpy.all(rval >= 0.0)
+
+    cPickle.dump(rval, f, protocol=cPickle.HIGHEST_PROTOCOL)
+    f.close()
+
+@memo
+def get_32x32_grey():
+    f = file(_32x32grey_path)
+    if _32x32grey_header != cPickle.load(f):
+        raise ValueError('wrong pickle file')
+    rval = cPickle.load(f)
+    f.close()
+    return rval
+
+
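For orientation, a small usage sketch of this loader (illustrative only, not part of the changeset; it assumes the COIL-100 PNGs have been unpacked under $PYLEARN_DATA_ROOT/COIL-100/coil-100):

    from pylearn.dataset_ops import COIL100

    COIL100.build_32x32_grey()       # one-time pass over the images; writes dct_32x32_grey.pkl
    data = COIL100.get_32x32_grey()  # memoized load of the (100, 72, 32, 32) float32 array
    print data.shape, data.dtype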
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/MNIST.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,126 @@
+"""Regular MNIST using the dataset protocol
+"""
+import os, numpy
+import theano
+from pylearn.datasets.config import data_root # config
+from pylearn.io.ubyte import read_ubyte_matrix
+from protocol import TensorFnDataset # protocol.py __init__.py
+from .memo import memo
+
+@memo
+def get_train_img_u8_rasterized():
+    """Returns 60000 x 784 MNIST train set"""
+    return read_ubyte_matrix(
+            os.path.join(data_root(), 'mnist', 'train-images-idx3-ubyte'),
+            60000, 784, 16,
+            write=False, align=True, as_dtype='uint8')
+@memo
+def get_test_img_u8_rasterized():
+    """Returns 10000 x 784 MNIST test set"""
+    return read_ubyte_matrix(
+            os.path.join(data_root(), 'mnist', 't10k-images-idx3-ubyte'),
+            10000, 784, 16,
+            write=False, align=True, as_dtype='uint8')
+@memo
+def get_train_labels():
+    # these are actually uint8, but the nnet classif code is for ints.
+    return read_ubyte_matrix(
+            os.path.join(data_root(), 'mnist', 'train-labels-idx1-ubyte'),
+            60000, 1, 8,
+            write=False, align=True, as_dtype='int32').reshape(60000)
+@memo
+def get_test_labels():
+    # these are actually uint8, but the nnet classif code is for ints.
+    return read_ubyte_matrix(
+            os.path.join(data_root(), 'mnist', 't10k-labels-idx1-ubyte'),
+            10000, 1, 8,
+            write=False, align=True, as_dtype='int32').reshape(10000)
+
+#This will cause both the uint8 version and the float version of the dataset to be cached.
+# For larger datasets, it would be better to use Theano's cast(x, dtype) to do this conversion
+# on the fly.
+@memo
+def get_train_img_f32_rasterized():
+    return get_train_img_u8_rasterized() / numpy.asarray(255, dtype='float32')
+@memo
+def get_train_img_f64_rasterized():
+    return get_train_img_u8_rasterized() / numpy.asarray(255, dtype='float64')
+@memo
+def get_test_img_f32_rasterized():
+    return get_test_img_u8_rasterized() / numpy.asarray(255, dtype='float32')
+@memo
+def get_test_img_f64_rasterized():
+    return get_test_img_u8_rasterized() / numpy.asarray(255, dtype='float64')
+
+#@constructor
+def mnist(s_idx, split, dtype='float64', rasterized=False):
+    """
+    :param s_idx: symbolic integer scalar (or vector) indexing examples within the split
+
+    :param split: one of 'train', 'valid', or 'test'
+
+    :param dtype: 'uint8', 'float32', or 'float64' (float versions are scaled to [0, 1])
+
+    :param rasterized: return examples as 784-element vectors (True) or 28x28 matrices (False)
+
+    """
+    if split not in ('train', 'valid', 'test'):
+        raise ValueError('split should be train, valid, or test', split)
+
+    if split == 'test':
+        l_fn = get_test_labels
+        if dtype == 'uint8':
+            i_fn = get_test_img_u8_rasterized
+        elif dtype == 'float32':
+            i_fn = get_test_img_f32_rasterized
+        elif dtype == 'float64':
+            i_fn = get_test_img_f64_rasterized
+        else:
+            raise ValueError('invalid dtype', dtype)
+    else:
+        l_fn = get_train_labels
+        if dtype == 'uint8':
+            i_fn = get_train_img_u8_rasterized
+        elif dtype == 'float32':
+            i_fn = get_train_img_f32_rasterized
+        elif dtype == 'float64':
+            i_fn = get_train_img_f64_rasterized
+        else:
+            raise ValueError('invalid dtype', dtype)
+
+    if split == 'test':
+        idx = s_idx
+    elif split == 'train':
+        idx = s_idx % 50000
+    else: #valid
+        idx = s_idx + 50000
+
+    x = TensorFnDataset(dtype, (False,), i_fn, (784,))(idx)
+    y = TensorFnDataset('int32', (), l_fn)(idx)
+    if x.ndim == 1:
+        if not rasterized:
+            x = x.reshape((28, 28))
+    elif x.ndim == 2:
+        if not rasterized:
+            x = x.reshape((x.shape[0], 28, 28))
+    else:
+        assert False, 'what happened?'
+
+    return x, y
+nclasses = 10
+
+def glviewer(part='train'):
+    from glviewer import GlViewer
+    if part == 'train':
+        if 0:
+            #hack that doesn't use the op
+            x = get_train_img_u8_rasterized().reshape((60000, 28, 28))
+            GlViewer(x.__getitem__).main()
+        else:
+            # test that uses the op
+            i = theano.tensor.iscalar()
+            f = theano.function([i], mnist(i, 'train', dtype='uint8', rasterized=False))
+            GlViewer(f).main()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/README.txt	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,13 @@
+The dataset_ops folder contains Theano Ops that provide dataset access to theano
+programs.
+
+The protocol.py file sets out the basic convention that is followed by the Ops
+in the other files.
+
+For an example of how to set up a dataset whose elements are slices from some
+big underlying tensor, see MNIST.py.
+
+For an example of how to set up a dynamically-generated dataset, see
+gldataset.py.
+
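To make the convention concrete, here is a minimal usage sketch (not part of the changeset; it assumes the `mnist` constructor from MNIST.py and the Theano API of this era): a dataset op maps a symbolic index to a symbolic example, and a compiled function pulls examples lazily.

    import theano
    from pylearn.dataset_ops.MNIST import mnist

    s_idx = theano.tensor.iscalar('s_idx')          # symbolic example index
    x, y = mnist(s_idx, 'train', dtype='float32', rasterized=True)
    get_example = theano.function([s_idx], [x, y])  # reads the example when called
    x0, y0 = get_example(0)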
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/__init__.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,4 @@
+import logging
+logging.getLogger('dataset_ops').setLevel(logging.INFO)
+
+from protocol import Dataset, TensorDataset, TensorFnDataset # protocol.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/dataset_ops/gldataset.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,490 @@ +"""Demonstrate a complicated dynamically-generated dataset. +""" + +# __init__.py + +import sys, copy, logging, sys + +import Image #PIL + +from OpenGL.GL import * +from OpenGL.GLU import * +from OpenGL.GLUT import * +from pyglew import * + +from glviewer import load_texture + +import numpy + +import theano +from theano.compile.sandbox import shared +from theano.compile.sandbox import pfunc as function + +_logger = logging.getLogger('gldataset') +def debug(*msg): _logger.debug(' '.join(str(m) for m in msg)) +def info(*msg): _logger.info(' '.join(str(m) for m in msg)) +def warn(*msg): _logger.warn(' '.join(str(m) for m in msg)) +def warning(*msg): _logger.warning(' '.join(str(m) for m in msg)) +def error(*msg): _logger.error(' '.join(str(m) for m in msg)) + +def init_GL(shape=(64,64), title='Offscreen rendering using FB0'): + if not init_GL.done: + w, h = shape + init_GL.done = True + info('initializing OpenGl subsystem') + glutInit (sys.argv) + glutInitDisplayMode (GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH) + glutInitWindowSize (w,h) + init_GL.window = glutCreateWindow (title) + glewInit() + + glEnable(GL_TEXTURE_2D) + glClearColor(0.0, 0.0, 0.0, 0.0) # This Will Clear The Background Color To Black + glClearDepth(1.0) # Enables Clearing Of The Depth Buffer + glDepthFunc(GL_LESS) # The Type Of Depth Test To Do + glEnable(GL_DEPTH_TEST) # Enables Depth Testing + glShadeModel(GL_SMOOTH) # Enables Smooth Color Shading + + #glMatrixMode(GL_PROJECTION) + #glLoadIdentity() # Reset The Projection Matrix + # Calculate The Aspect Ratio Of The Window + #gluPerspective(45.0, float(64)/float(64), 0.1, 100.0) + glMatrixMode(GL_MODELVIEW) +init_GL.done = False + +class PBufRenderer(object): + """Render an OpenGL program to a framebuffer instead of the screen. + + The way to use this class is to enclose all the OpenGL commands you want to render between + a call to setup() and a call to render(). So you would render a frame like this: + + .. code-block:: python + + p = PBufRenderer(shape) + p.setup() + my_display_code() + a = p.render() + my_display_code() + b = p.render() + + After running this code, 'a' and 'b' will be numpy arrays of shape `shape` + (3,) containing an + RBG rendering of your display_code. + """ + def __init__(self, size=(128,128), upside_down=False): + """ Offscreen rendering + + Save an offscreen rendering of size (w,h) to filename. + """ + + def round2 (n): + """ Get nearest power of two superior to n """ + f = 1 + while f<n: + f*= 2 + return f + + if size == None: + size = (512,512) + w = round2 (size[0]) + h = round2 (size[1]) + + image = Image.new ("RGB", (w, h), (0, 0, 0)) + bits = image.tostring("raw", "RGBX", 0, -1) + + debug('allocating framebuffer') + framebuffer = glGenFramebuffersEXT (1) + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer) + + debug('allocating depthbuffer') + depthbuffer = glGenRenderbuffersEXT (1) + glBindRenderbufferEXT (GL_RENDERBUFFER_EXT,depthbuffer) + glRenderbufferStorageEXT (GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT, w, h) + + # Create texture to render to + debug('allocating dynamic texture') + texture = glGenTextures (1) + glBindTexture (GL_TEXTURE_2D, texture) + glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR) + glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR) + # Question: why do we need to upload a texture that we are rendering *to* ? 
+ glTexImage2D (GL_TEXTURE_2D, 0, GL_RGB, w, h, 0, + GL_RGB, GL_UNSIGNED_BYTE, bits) + + # store variables for later use. + self.texture = texture + self.framebuffer = framebuffer + self.depthbuffer = depthbuffer + self.image = image + self.bits = bits + self.size = size + self.texture_size = (w,h) + self.upside_down = upside_down + + # set the screen as output + glBindRenderbufferEXT (GL_RENDERBUFFER_EXT, 0) + glBindFramebufferEXT (GL_FRAMEBUFFER_EXT, 0) + + def __del__(self): + glBindRenderbufferEXT (GL_RENDERBUFFER_EXT, 0) + glBindFramebufferEXT (GL_FRAMEBUFFER_EXT, 0) + glDeleteTextures (1,[self.texture]) + glDeleteFramebuffersEXT (1, [self.framebuffer]) + glDeleteRenderbuffersExt (1, [self.depthbuffer]) + + def setup(self): + glBindRenderbufferEXT (GL_RENDERBUFFER_EXT, self.depthbuffer) + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, self.framebuffer) + glFramebufferTexture2DEXT (GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, + GL_TEXTURE_2D, self.texture, 0); + glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT, + GL_RENDERBUFFER_EXT, self.depthbuffer); + + status = glCheckFramebufferStatusEXT (GL_FRAMEBUFFER_EXT); + if status != GL_FRAMEBUFFER_COMPLETE_EXT: + raise RuntimeError( "Error in framebuffer activation") + + # Re-orient viewport + glViewport (0, 0, self.size[0], self.size[1]) + glMatrixMode (GL_PROJECTION) + glLoadIdentity() + gluPerspective (40.,self.size[0]/float(self.size[1]),1.,40.) + glMatrixMode (GL_MODELVIEW) + glLoadIdentity() + gluLookAt (0,0,10, 0,0,0, 0,1,0) + + def render(self): + # TODO: Can we get away with glFlush? + glFinish() #renders to our framebuffer + + # read back the framebuffer to self.image + glBindTexture (GL_TEXTURE_2D, self.texture) + w,h = self.texture_size + data = glReadPixels (0, 0, w, h, GL_RGB, GL_UNSIGNED_BYTE) + rval = numpy.fromstring(data, dtype='uint8', count=w*h*3).reshape((w,h,3)) + if self.size != self.texture_size: + rval = rval[:self.size[0], :self.size[1],:] + + # return to default state of screen rendering + glBindRenderbufferEXT (GL_RENDERBUFFER_EXT, 0) + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0) + if self.upside_down: + return rval + else: + return rval[::-1,:,:] + +class OpenGlMovieFromImage(theano.Op): + """Helper base class to factor code used by Ops that want to make a movie from an input + image, using OpenGL. The subclass specifies how to actually make the movie. + """ + + def __init__(self, width, height, upside_down=False): + """To set up the renderer, we need to know the frame size of the images. + Setting up the renderer for each image is much slower. 
+ """ + init_GL() #global initialization is no-op after first call + self.width=width + self.height=height + self.upside_down=upside_down + + self.renderer = None + # Delay construction of renderer until after merge-optimization + #PBufRenderer((width, height), upside_down=upside_down) + + #TODO: put texture into output state as reusable resource + self.texture = glGenTextures(1) + + def __del__(self): + glDeleteTextures (1,[self.texture]) + + def __eq__(self, other): + return type(self) == type(other) \ + and self.width == other.width \ + and self.height == other.height \ + and self.upside_down == other.upside_down + + def __hash__(self): + return hash(type(self)) ^ hash(self.width) ^ hash(self.height) ^ hash(self.upside_down) + + def make_node(self, x, istate): + _x = theano.tensor.as_tensor_variable(x) + if _x.type.dtype != 'uint8': + raise TypeError('must be 2- or 3-tensor of uint8', x) + if _x.type.broadcastable != (False, False) \ + and _x.type.broadcastable != (False, False, False): + raise TypeError('must be a 2- or 3-tensor of uint8', x) + if not isinstance(istate, theano.Variable): + raise TypeError("variable expected", istate) + o_type = theano.tensor.TensorType(dtype='uint8', broadcastable=[False, False, False, False]) + state_type = theano.gof.type.generic + return theano.Apply(self, [x, istate], [o_type(), state_type()]) + + def perform(self, node, (x, istate), (z_storage, ostate_storage)): + if self.renderer is None: + self.renderer = PBufRenderer((self.width, self.height), upside_down=self.upside_down) + + ostate = copy.deepcopy(istate) + self.renderer.setup() + + glBindTexture(GL_TEXTURE_2D, self.texture) # 2d texture (x and y size) + load_texture(x) + + z = numpy.zeros(self.z_shape, dtype='uint8') + for i in xrange(self.n_frames): + self.perform_set_state(istate, ostate, i) + self.perform_display(x, ostate, i) + di = self.renderer.render() + z[i] = di + + # store output images + z_storage[0] = z + + # store next state ostate_storage + ostate_storage[0] = ostate + +class ImageOnSpinningCube(OpenGlMovieFromImage): + def __init__(self, (n_frames, width, height), upside_down=False): + super(ImageOnSpinningCube, self).__init__(width, height, upside_down=upside_down) + self.n_frames = n_frames + self.z_shape = (n_frames, width, height, 3) + + def __eq__(self, other): + return super(ImageOnSpinningCube, self).__eq__(other) \ + and self.n_frames == other.n_frames \ + + def __hash__(self): + return super(ImageOnSpinningCube, self).__hash__() ^ hash(self.n_frames) + + def new_state(self, speed=10): + return dict( + rot=numpy.asarray((0.,0.,0.)), + drot=numpy.asarray((speed,speed,speed)), + ) + + def perform_set_state(self, istate, ostate, iter): + ostate['rot'] = istate['rot'] + istate['drot'] * iter + + def perform_display(self, x, ostate, i): + # retrieve some state variables related to rendering + xrot,yrot,zrot = ostate['rot'] + dxrot,dyrot,dzrot = ostate['drot'] + + # load x as a texture + glBindTexture(GL_TEXTURE_2D, self.texture) # 2d texture (x and y size) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST) + glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_DECAL) + + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # Clear The Screen And The 
Depth Buffer + glLoadIdentity() # Reset The View + glTranslatef(0.0,0.0,-5.0) # Move Into The Screen + + glRotatef(xrot,1.0,0.0,0.0) # Rotate The Cube On It's X Axis + glRotatef(yrot,0.0,1.0,0.0) # Rotate The Cube On It's Y Axis + glRotatef(zrot,0.0,0.0,1.0) # Rotate The Cube On It's Z Axis + + glBegin(GL_QUADS) # Start Drawing The Cube + + # Front Face (note that the texture's corners have to match the quad's corners) + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, 1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, 1.0) # Top Left Of The Texture and Quad + + # Back Face + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, -1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, -1.0) # Bottom Left Of The Texture and Quad + + # Top Face + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, 1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, 1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + + # Bottom Face + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, -1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, -1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + + # Right face + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, -1.0, -1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, 1.0, 1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + + # Left Face + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, -1.0, -1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, 1.0, 1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + + glEnd(); # Done Drawing The Cube + +def image_on_spinning_cube(x, shape, upside_down=False): + op = ImageOnSpinningCube(shape, upside_down=upside_down) + istate = shared(op.new_state()) + z, ostate = op(x, istate) + return z, {istate: ostate} + +class BrownianCamera(OpenGlMovieFromImage): + def __init__(self, (n_frames, width, height), upside_down=False): + super(BrownianCamera, self).__init__(width, height, upside_down=upside_down) + self.n_frames = n_frames + self.z_shape = (n_frames, width, height, 3) + + def __eq__(self, other): + return super(self.__class__, self).__eq__(other) \ + and self.n_frames == other.n_frames \ + + def __hash__(self): + return super(self.__class__, self).__hash__() ^ hash(self.n_frames) + + def new_state(self, pos_jitter=(.01,.01,.03), rot_jitter=(4.,4.,4.), seed=23424): + return dict( + 
pos_jitter=numpy.asarray(pos_jitter), + rot_jitter=numpy.asarray(rot_jitter), + pos0=numpy.asarray((0.,0.,-4.0)), + rot0=numpy.asarray((0.,0.,0.)), + alpha=0.1, + # dynamic things + pos=numpy.asarray((0.,0.,-4.0)), + dpos=numpy.asarray((0.,0.,0.)), + ddpos=numpy.asarray((0.,0.,0.)), + rot=numpy.asarray((0.,0.,0.)), + drot=numpy.asarray((0.,0.,0.)), + ddrot=numpy.asarray((0.,0.,0.)), + rng = numpy.random.RandomState(seed), + ) + + def perform_set_state(self, istate, ostate, iter): + alpha = ostate['alpha'] + if iter == 0: + ostate['pos'] = ostate['pos0'].copy() + ostate['dpos'] *= 0 + ostate['rot'] = ostate['rot0'].copy() + ostate['drot'] *= 0 + ostate['ddpos'] = ostate['rng'].uniform(low=-1,high=1,size=3) * ostate['pos_jitter'] + ostate['ddrot'] = ostate['rng'].uniform(low=-1,high=1,size=3) * ostate['rot_jitter'] + ostate['dpos'] += ostate['ddpos'] + ostate['drot'] += ostate['ddrot'] + ostate['pos'] = (1-alpha)*(ostate['pos'] + ostate['dpos']) + alpha * ostate['pos0'] + ostate['rot'] = (1-alpha)*(ostate['rot'] + ostate['drot']) + alpha * ostate['rot0'] + + def perform_display(self, x, ostate, i): + # retrieve some state variables related to rendering + xrot,yrot,zrot = ostate['rot'] + xpos,ypos,zpos = ostate['pos'] + + # load x as a texture + glBindTexture(GL_TEXTURE_2D, self.texture) # 2d texture (x and y size) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST) + glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_DECAL) + + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # Clear The Screen And The Depth Buffer + glLoadIdentity() # Reset The View + glTranslatef(xpos,ypos,zpos) # Move Into The Screen + + glRotatef(xrot,1.0,0.0,0.0) # Rotate The Cube On It's X Axis + glRotatef(yrot,0.0,1.0,0.0) # Rotate The Cube On It's Y Axis + glRotatef(zrot,0.0,0.0,1.0) # Rotate The Cube On It's Z Axis + + glBegin(GL_QUADS) # Start Drawing The Cube + + # Front Face (note that the texture's corners have to match the quad's corners) + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, 1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, 1.0) # Top Left Of The Texture and Quad + + glEnd(); # Done Drawing The Cube + +_brownian_camera_ops = {} +def brownian_camera(x, shape, upside_down=False, seed=8234, speed=1.0): + if (shape, upside_down) not in _brownian_camera_ops: + _brownian_camera_ops[(shape, upside_down)] = BrownianCamera(shape, upside_down=upside_down) + op = _brownian_camera_ops[(shape, upside_down)] + istate = shared(op.new_state(seed=seed)) + istate.value['pos_jitter'] *= speed + istate.value['rot_jitter'] *= speed + z, ostate = op(x, istate) + return z, [(istate, ostate)] + + +def _dump_to_file(fn, filename='out.pkl', nexamples=1000, n_frames=10, **kwargs): + logging.basicConfig(level=logging.INFO, stream=sys.stderr) + import cPickle, time + + from MNIST import mnist + i = theano.tensor.iscalar() + z, z_updates = fn(mnist(i%50000, 'train', rasterized=False, dtype='uint8')[0], (n_frames, 28,28), **kwargs) + f = function([i], z[:,:,:,0], updates=z_updates) 
+ + t0 = time.time() + rval = [] + for j in xrange(nexamples): + if 0 == j % 100: print >> sys.stderr, j + rval.append(f(j)) + dt = time.time() - t0 + info('Generating ', nexamples, 'examples took', dt, 'seconds.') + info('Generation rate:', nexamples/dt, 'examples per second.') + info('Generated ', nexamples*n_frames, 'frames') + info('Generation rate:', nexamples*n_frames/dt, 'frames per second.') + + cPickle.dump(rval, file(filename, 'w'), protocol=cPickle.HIGHEST_PROTOCOL) +def spinning_cube_dump(filename='spinning_cube.pkl', *args, **kwargs): + return _dump_to_file(fn=image_on_spinning_cube, filename=filename, *args, **kwargs) +def brownian_camera_dump(filename='brownian_camera.pkl', *args, **kwargs): + return _dump_to_file(fn=brownian_camera, filename=filename, *args, **kwargs) +def brownian_camera_dumpN(filename='brownian_cameraN.pkl', nexamples=10, n_frames=5, + n_movies=10, img_shape=(28,28), **kwargs): + logging.basicConfig(level=logging.INFO, stream=sys.stderr) + import cPickle, time + from MNIST import mnist + + s_idx = theano.tensor.iscalar() + inputs_updates = [brownian_camera( + x=mnist(s_idx*n_movies+i, 'train', rasterized=False, dtype='uint8')[0], + shape=(n_frames,)+img_shape, + seed=234234+i, **kwargs) + for i in xrange(n_movies)] + s_input = theano.tensor.stack(*(input for (input,update) in inputs_updates))\ + .reshape((n_movies*n_frames,)+img_shape+(3,)) + s_updates = [] + for i,u in inputs_updates: + s_updates.extend(u) + print s_updates + f = function([s_idx], s_input, updates=s_updates) + + t0 = time.time() + rval = [] + for j in xrange(nexamples): + if 0 == j % 1000: print >> sys.stderr, j + rval.append(f(j)) + dt = time.time() - t0 + info('Generating ', nexamples, 'examples took', dt, 'seconds.') + info('Generation rate:', nexamples/dt, 'examples per second.') + info('Generated ', nexamples*n_movies*n_frames, 'frames') + info('Generation rate:', nexamples*n_movies*n_frames/dt, 'frames per second.') + + cPickle.dump(rval, file(filename, 'w')) + + +def glviewer_from_file(filename='out.pkl'): + logging.basicConfig(level=logging.DEBUG, stream=sys.stderr) + import cPickle + rval = cPickle.load(file(filename)) + from glviewer import GlViewer + GlViewer(rval.__getitem__).main() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/dataset_ops/glviewer.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,367 @@ +"""This file provides a very crude image-viewing and movie-viewing mini-application. + +It is provided to assist in the development of datasets whose elements are images or movies. +For an example of how to do this, see the `glviewer` function in MNIST.py . + +Currently, the key controls that navigate the dataset are: + + j - next dataset element + k - previous dataset element + 0 - show image 0 + + J - next frame in current movie + K - previous frame in current movie + ) - show frame 0 of current movie + + q - quit. +""" +# Modified to be an image viewer by James Bergstra Sept 2009 +# +# Ported to PyOpenGL 2.0 by Tarn Weisner Burton 10May2001 +# +# This code was created by Richard Campbell '99 (ported to Python/PyOpenGL by John Ferguson 2000) +# +# The port was based on the lesson5 tutorial module by Tony Colston (tonetheman@hotmail.com). +# +# If you've found this code useful, please let me know (email John Ferguson at hakuin@voicenet.com). +# +# See original source and C based tutorial at http:#nehe.gamedev.net +# + + +import traceback +import time +import string +from OpenGL.GL import * +from OpenGL.GLUT import * +from OpenGL.GLU import * +import sys +from Image import * +import numpy + +import logging +_logger = logging.getLogger('glviewer') +_logger.setLevel(logging.INFO) +def debug(*msg): _logger.debug(' '.join(str(m) for m in msg)) +def info(*msg): _logger.info(' '.join(str(m) for m in msg)) +def warn(*msg): _logger.warn(' '.join(str(m) for m in msg)) +def warning(*msg): _logger.warning(' '.join(str(m) for m in msg)) +def error(*msg): _logger.error(' '.join(str(m) for m in msg)) + +def load_texture(x): + debug('loading texture with shape', x.shape) + if x.ndim == 2: + if x.dtype == 'uint8': + rows, cols = x.shape + buf = numpy.zeros((rows, cols, 4), dtype=x.dtype) + buf += x.reshape( (rows, cols, 1)) + glPixelStorei(GL_UNPACK_ALIGNMENT,1) + return glTexImage2D(GL_TEXTURE_2D, 0, 3, cols, rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, + buf[::-1].flatten()) + else: + raise NotImplementedError() + elif x.ndim == 3: + rows, cols, channels = x.shape + if x.dtype == 'uint8': + if channels == 4: + return glTexImage2D(GL_TEXTURE_2D, 0, 3, cols, rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, x[::-1].flatten()) + else: + buf = numpy.zeros((rows, cols, 4), dtype=x.dtype) + if channels == 1: + buf += x.reshape( (rows, cols, 1)) + if channels == 3: + buf[:,:,:3] = x + return glTexImage2D(GL_TEXTURE_2D, 0, 3, cols, rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, buf[::-1].flatten()) + else: + raise NotImplementedError() + else: + raise NotImplementedError() + + # if you get here, it means a case was missed + assert 0 + + +class GlViewer(object): + # Number of the glut window. + window = 0 + + view_angle = 28.0 # this makes the edge of the cube match up with the viewport + + def __init__(self, texture_fn): + + # Rotations for cube. 
+ self.xrot = self.yrot = self.zrot = 0.0 + + self.texture = 0 + + self.texture_fn = texture_fn + + self.pos = -1 + self.pos_frame = -1 + self.pos_is_movie = False + self.texture_array = None + + self.win_shape = (256, 256) + self.rot = numpy.zeros(3) + self.drot = numpy.ones(3) * .0 + + def init_LoadTextures(self): + # Create Texture + glBindTexture(GL_TEXTURE_2D, glGenTextures(1)) # 2d texture (x and y size) + self.refresh_texture(0, 0) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST) + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST) + glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_DECAL) + + + # A general OpenGL initialization function. Sets all of the initial parameters. + def init_GL(self): + glEnable(GL_TEXTURE_2D) + glClearColor(0.0, 0.0, 0.0, 0.0) # This Will Clear The Background Color To Black + glClearDepth(1.0) # Enables Clearing Of The Depth Buffer + glDepthFunc(GL_LESS) # The Type Of Depth Test To Do + glEnable(GL_DEPTH_TEST) # Enables Depth Testing + glShadeModel(GL_SMOOTH) # Enables Smooth Color Shading + + glMatrixMode(GL_PROJECTION) + glLoadIdentity() # Reset The Projection Matrix + # Calculate The Aspect Ratio Of The Window + Width, Height = self.win_shape + gluPerspective(self.view_angle, float(Width)/float(Height), 0.1, 100.0) + + glMatrixMode(GL_MODELVIEW) + + def main(self): + # + # texture gen: an iterator over images + # + # Call this function like this: + # python -c 'import MNIST, glviewer; glviewer.main(x for (x,y) in MNIST.MNIST().train())' + # + + #TODO: this advances the iterator un-necessarily... we just want a frame to look at the + # dimensions + + global window + glutInit(sys.argv) + + # Select type of Display mode: + # Double buffer + # RGBA color + # Alpha components supported + # Depth buffer + info('initializing OpenGl subsystem') + ##glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH) + + win_width, win_height = self.win_shape + + # get a 640 x 480 window + ##glutInitWindowSize(win_width, win_height) + + # the window starts at the upper left corner of the screen + glutInitWindowPosition(0, 0) + + # Okay, like the C version we retain the window id to use when closing, but for those of you new + # to Python (like myself), remember this assignment would make the variable local and not global + # if it weren't for the global declaration at the start of main. + window = glutCreateWindow("GlViewer") + + # Register the drawing function with glut, BUT in Python land, at least using PyOpenGL, we need to + # set the function pointer and invoke a function to actually register the callback, otherwise it + # would be very much like the C version of the code. + glutDisplayFunc(self.draw_scene) + + # Uncomment this line to get full screen. + # glutFullScreen() + + # When we are doing nothing, redraw the scene. + glutIdleFunc(self.on_idle) + + # Register the function called when our window is resized. + glutReshapeFunc(self.ReSizeGLScene) + + # Register the function called when the keyboard is pressed. + glutKeyboardFunc(self.keyPressed) + + # create the texture we will use for showing images + self.init_LoadTextures() + + # Initialize our window. 
+ self.init_GL() + + # Start Event Processing Engine + glutMainLoop() + + # The function called when our window is resized (which shouldn't happen if you enable fullscreen, below) + def ReSizeGLScene(self, Width, Height): + if Height == 0: # Prevent A Divide By Zero If The Window Is Too Small + Height = 1 + + glViewport(0, 0, Width, Height) # Reset The Current Viewport And Perspective Transformation + glMatrixMode(GL_PROJECTION) + glLoadIdentity() + gluPerspective(self.view_angle, float(Width)/float(Height), 0.1, 100.0) + glMatrixMode(GL_MODELVIEW) + + self.win_shape = (Width, Height) + + + def refresh_texture(self, new_pos, new_frame): + debug('refresh_texture', new_pos, new_frame, 'current', self.pos, self.pos_frame) + if new_pos != self.pos: + texture_array = None + try: + texture_array = self.texture_fn(new_pos) + except Exception, e: + traceback.print_exc() + + if texture_array is None: + return + # calling the texture_fn can mess up the OpenGL state + # here we set it up again + self.init_GL() + + self.pos_is_movie=False + if texture_array.ndim == 4: + self.pos_is_movie = True + if texture_array.ndim == 3 and texture_array.shape[2] > 4: + self.pos_is_movie = True + + self.pos = new_pos + self.texture_array = texture_array + pos_changed = True + if self.pos_is_movie: + info('example', new_pos, 'is movie of', texture_array.shape[0], 'frames') + else: + info('example', new_pos, 'is still frame') + else: + pos_changed = False + texture_array = self.texture_array + + if new_frame == self.pos_frame and not pos_changed: + # nothing to do + return + + if self.pos_is_movie: + n_frames = texture_array.shape[0] + if n_frames > new_frame: + self.pos_frame = new_frame + load_texture(texture_array[new_frame]) + else: + # current frame goes beyond end of movie + pass + else: + # this example is a static frame + load_texture(texture_array) + + # The main drawing function. + def on_idle(self): + # update state stuff pre-draw + self.draw_scene() + + # update state stuff post draw + self.rot += self.drot + + def draw_scene(self): + + xrot, yrot, zrot = self.rot + + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # Clear The Screen And The Depth Buffer + glLoadIdentity() # Reset The View + glTranslatef(0.0,0.0,-5.0) # Move Into The Screen + + glRotatef(xrot,1.0,0.0,0.0) # Rotate The Cube On It's X Axis + glRotatef(yrot,0.0,1.0,0.0) # Rotate The Cube On It's Y Axis + glRotatef(zrot,0.0,0.0,1.0) # Rotate The Cube On It's Z Axis + + # Note there does not seem to be support for this call. 
+ #glBindTexture(GL_TEXTURE_2D,texture) # Rotate The Pyramid On It's Y Axis + + glBegin(GL_QUADS) # Start Drawing The Cube + + # Front Face (note that the texture's corners have to match the quad's corners) + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, 1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, 1.0) # Top Left Of The Texture and Quad + + # Back Face + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, -1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, -1.0) # Bottom Left Of The Texture and Quad + + # Top Face + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, 1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, 1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + + # Bottom Face + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, -1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, -1.0, -1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + + # Right face + glTexCoord2f(1.0, 0.0); glVertex3f( 1.0, -1.0, -1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f( 1.0, 1.0, -1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f( 1.0, 1.0, 1.0) # Top Left Of The Texture and Quad + glTexCoord2f(0.0, 0.0); glVertex3f( 1.0, -1.0, 1.0) # Bottom Left Of The Texture and Quad + + # Left Face + glTexCoord2f(0.0, 0.0); glVertex3f(-1.0, -1.0, -1.0) # Bottom Left Of The Texture and Quad + glTexCoord2f(1.0, 0.0); glVertex3f(-1.0, -1.0, 1.0) # Bottom Right Of The Texture and Quad + glTexCoord2f(1.0, 1.0); glVertex3f(-1.0, 1.0, 1.0) # Top Right Of The Texture and Quad + glTexCoord2f(0.0, 1.0); glVertex3f(-1.0, 1.0, -1.0) # Top Left Of The Texture and Quad + + glEnd(); # Done Drawing The Cube + + # since this is double buffered, swap the buffers to display what just got drawn. + glutSwapBuffers() + + # The function called whenever a key is pressed. 
Note the use of Python tuples to pass in: (key, x, y) + def keyPressed(self, *args): + ESCAPE = '\033' + + # EXAMPLE CONTROLS + + if args[0] == 'j': # down + self.refresh_texture(self.pos + 1, 0) + info( 'Current image: ', self.pos) + elif args[0] == 'k': # up + self.refresh_texture(self.pos - 1, 0) + info( 'Current image: ', self.pos) + elif args[0] == '0': # reset to position 0 + self.refresh_texture(0, 0) + info( 'Current image: ', self.pos) + + # FRAME CONTROLS + + elif args[0] == ')': # ')' is shift-0, reset to frame 0 + self.refresh_texture(self.pos, 0) + info( 'Current image: ', self.pos) + elif args[0] == 'J': # advance frame + self.refresh_texture(self.pos, self.pos_frame + 1) + info( 'Next frame') + elif args[0] == 'K': # advance frame + if self.pos_frame: + self.refresh_texture(self.pos, self.pos_frame - 1) + info( 'Previous frame') + else: + warn('Not backing up past frame 0') + + elif args[0] == ESCAPE or args[0]=='q': + sys.exit() + + +if __name__ == '__main__': + + sample_data = numpy.asarray(numpy.random.randint(low=0, high=256, size=(5, 64,64)), + dtype='uint8') + GlViewer(sample_data.__getitem__).main() +
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/memo.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,22 @@
+"""Provide a decorator that caches expensive functions
+"""
+import logging
+_logger = logging.getLogger(__file__)
+info = _logger.info
+def infop(*args):
+    info(' '.join(str(a) for a in args))
+
+def memo(f):
+    #TODO: support kwargs to rval. This requires looking up the names of f's parameters to
+    #      construct a proper key.
+
+    #TODO: use weak references instead of a normal dict so that the cache doesn't prevent
+    #      garbage collection
+    cache = {}
+    def rval(*args):
+        if args not in cache:
+            cache[args] = f(*args)
+        return cache[args]
+    rval.__name__ = 'memo@%s'%f.__name__
+    return rval
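A short usage sketch of the decorator (illustrative only, not part of the diff; the function and path are hypothetical):

    from pylearn.dataset_ops.memo import memo

    @memo
    def load_big_array(path):
        # pretend this is an expensive disk read; it runs once per distinct args tuple
        print 'loading', path
        return open(path).read()

    a = load_big_array('/tmp/x')   # computes and caches
    b = load_big_array('/tmp/x')   # returned from the cache, no second read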
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/dataset_ops/protocol.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,83 @@ +"""Convenience base classes to help with writing Dataset ops + +""" + +__docformat__ = "restructuredtext_en" +import theano + +class Dataset(theano.Op): + """ + The basic dataset interface is an expression that maps an integer to a dataset element. + + There is also a minibatch option, in which the expression maps an array of integers to a + list or array of dataset elements. + """ + def __init__(self, single_type, batch_type): + self.single_type = single_type + self.batch_type = batch_type + + def make_node(self, idx): + _idx = theano.tensor.as_tensor_variable(idx) + if not _idx.dtype.startswith('int'): + raise TypeError() + if _idx.ndim == 0: # one example at a time + otype = self.single_type + elif _idx.ndim == 1: #many examples at a time + otype = self.batch_type + else: + raise TypeError(idx) + return theano.Apply(self, [_idx], [otype()]) + + def __eq__(self, other): + return type(self) == type(other) \ + and self.single_type == other.single_type \ + and self.batch_type == other.batch_type + + def __hash__(self): + return hash(type(self)) ^ hash(self.single_type) ^ hash(self.batch_type) + + def __str__(self): + return "%s{%s,%s}" % (self.__class__.__name__, self.single_type, self.batch_type) + + def grad(self, inputs, g_outputs): + return [None for i in inputs] + + +class TensorDataset(Dataset): + """A convenient base class for Datasets whose elements all have the same TensorType. + """ + def __init__(self, dtype, single_broadcastable, single_shape=None, batch_size=None): + single_broadcastable = tuple(single_broadcastable) + single_type = theano.tensor.Tensor( + broadcastable=single_broadcastable, + dtype=dtype, + shape=single_shape) + batch_type = theano.tensor.Tensor( + broadcastable=(False,)+single_type.broadcastable, + dtype=dtype, + shape=(batch_size,)+single_type.shape) + super(TensorDataset, self).__init__(single_type, batch_type) + +class TensorFnDataset(TensorDataset): + def __init__(self, dtype, bcast, fn, single_shape=None, batch_size=None): + super(TensorFnDataset, self).__init__(dtype, bcast, single_shape, batch_size) + self.fn = fn + + def __eq__(self, other): + return super(TensorFnDataset, self).__eq__(other) and self.fn == other.fn + + def __hash__(self): + return super(TensorFnDataset, self).__hash__() ^ hash(self.fn) + + def __str__(self): + try: + return "%s{%s}" % (self.__class__.__name__, self.fn.__name__) + except: + return "%s{%s}" % (self.__class__.__name__, self.fn) + + def perform(self, node, (idx,), (z,)): + x = self.fn() + if idx.ndim == 0: + z[0] = x[int(idx)] + else: + z[0] = x[idx]
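A minimal sketch of how TensorFnDataset is meant to be used, mirroring the call pattern in MNIST.py (assumptions: a module-level loader function so the Op hashes and compares cleanly, and the Theano API of this era; not part of the changeset):

    import numpy, theano
    from pylearn.dataset_ops.protocol import TensorFnDataset

    def get_data():
        # loader returning the whole underlying tensor; typically wrapped with @memo
        return numpy.arange(20, dtype='float64').reshape(10, 2)

    s_idx = theano.tensor.iscalar()
    row = TensorFnDataset('float64', (False,), get_data, single_shape=(2,))(s_idx)
    f = theano.function([s_idx], row)
    print f(3)   # -> [ 6.  7.]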
--- a/pylearn/datasets/config.py	Wed Oct 14 10:19:37 2009 -0400
+++ b/pylearn/datasets/config.py	Fri Oct 16 12:20:57 2009 -0400
@@ -4,18 +4,26 @@
 Especially, the locations of data files.
 """
 
-import os, sys
+import os, sys, logging
+_logger = logging.getLogger('pylearn.datasets.config')
+def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
+def info(*msg): _logger.info(' '.join(str(m) for m in msg))
+def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
+def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
+def error(*msg): _logger.error(' '.join(str(m) for m in msg))
+
+
 def env_get(key, default, key2 = None):
     if key2 and os.getenv(key) is None:
         key=key2
     if os.getenv(key) is None:
-        print >> sys.stderr, "WARNING: Environment variable", key,
-        print >> sys.stderr, "is not set. Using default of", default
-    if os.getenv(key) is None:
-        return default
+        if env_get.first_warning:
+            warning("Environment variable", key, 'is not set. Using default of', default)
+            env_get.first_warning = False
+        return default
     else:
-        return os.getenv(key)
-        #return default if os.getenv(key) is None else os.getenv(key)
+        return os.getenv(key)
+env_get.first_warning = True
 
 def data_root():
     return env_get('PYLEARN_DATA_ROOT', os.getenv('HOME')+'/data', 'DBPATH')
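To make the lookup order concrete, a small illustrative sketch (hypothetical paths, not part of the diff): PYLEARN_DATA_ROOT is consulted first, the legacy DBPATH variable second, and $HOME/data is the default when neither is set.

    import os
    from pylearn.datasets.config import data_root

    os.environ['PYLEARN_DATA_ROOT'] = '/scratch/datasets'
    print data_root()                         # -> /scratch/datasets
    print os.path.join(data_root(), 'mnist')  # where MNIST.py expects the ubyte files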
--- a/pylearn/lib/scan.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,205 +0,0 @@ -"""Provide Scan and related functions""" -__docformat__ = 'restructedtext en' - -import traceback -import numpy -import theano - -class Scan: - """A Theano loop - - :todo: Implement this, and rewrite `Scan1` to use `Scan` - - - """ - def __init__(self): - raise NotImplementedError() - - -def scan1(*args): - """args should have the form - - x, u, <other variables>, (lambda x_i, y_{i-1}, <other variables> : y_i) - - """ - assert len(args) >= 3 - x, u = args[:2] - lmb = args[-1] - other_inputs = list(args[2:-1]) - - x_this = x[0].type() - y_this = u.type() - y_next = lmb(x_this, y_this, *other_inputs) - if y_next.type != u.type: - raise TypeError('type of lambda recursion must match type of y_prev') - env = theano.Env([x_this, y_this] + other_inputs, [y_next]) - env_var = theano.Constant(data=env, type=theano.generic) - return _scan1(*([env_var] + list(args[:-1]))) - -class Scan1(theano.Op): - """A Theano loop over one variable - - Scan1 is less general than `Scan` because it permits looping only over one tensor. - - Scan1 is defined mathematically like this: - - input - iterable x - input - y-element-like u - input - function x[i], y[i-1] -> y[i] - output - iterable y - - .. code-block:: python - - #inputs - x #a tensor with ndim >= 1 - u #a tensor that is like a row of y - f #the function to scan over x - - for i in xrange(len(x)): - if i > 0: - y[i] = f(x[i], y[i-1]) - else: - y[0] = f(x[0], u) - - #outputs - y # a tensor with the same number of elements as x, - # each element of which is like u (in terms of shape and dtype) - - The Scan1 Op works by representing `f` by an `Env`. - - :note: - Internally, the representation is off-by-one wrt the documentation above. This Op creates - a tensor y whose len is greater by one than x, whose first element is a copy of u. - The `Scan1.__call__()` returns a subtensor view of this internal vector `y` that views only - the len-1 last elements, so the copy of `u` is not visible. - - - :todo: - Optimize for the case where y_this is not required to compute y_next. - This makes all the updates possible in parallel, it also makes the `u` argument to - make_node un-necessary. - - """ - - destroy_map = {} - view_map = {} - mode=None - default_output = 0 - - def make_node(self, env, *inputs): - """ - :note: - make_node must take all the same inputs as Apply holds, - so we use __call__ as the syntactic device that inserts self.extra_variables. 
- """ - x, u = inputs[:2] - - out_type = theano.tensor.Tensor(dtype=u.dtype, - broadcastable=[False] + list(u.broadcastable)) - return theano.Apply(self, [env]+list(inputs), [out_type(), theano.generic()]) - - def grad(self, inputs, (g_y, g_fn)): - assert g_fn is None - - y = self(*inputs) - grads = scan1_grad(g_y, y, *inputs) - return [None] + grads[:-1] - - def perform(self, node, args, (y_out, fn_out)): - - env, x, u = args[:3] - other_args = args[3:] - - if fn_out[0] is None: - assert len(env.outputs) == 1 - fn_out[0] = theano.function( - inputs=env.inputs, - outputs=env.outputs[0], - mode=self.mode) - fn = fn_out[0] - - y_shape = (x.shape[0]+1,) + u.shape - y = numpy.empty(y_shape, dtype=u.dtype) - - #print 'x', x - #print 'y', y - #print 'u', u - #print 'other', other_args - - y[0] = u - for i, x_i in enumerate(x): - something = fn(x_i, y[i], *other_args) - #print 'something', something - y[i+1] = something - y_out[0] = y -_scan1 = Scan1() - - -class Scan1Grad(theano.Op): - def __init__(self, inplace=False): - self.inplace = inplace - if inplace: - self.destroy_map = {1: [3]} - - def make_node(self, g_y, y, scan_env, x, u, *other_inputs): - return theano.Apply(self, - [g_y, y, scan_env, x, u] + list(other_inputs), - [x.type(), u.type()] + [oi.type() for oi in other_inputs] + [theano.generic()]) - - def get_fn(self, scan_env, grad_storage): - fn_storage = grad_storage[-1] - assert isinstance(scan_env, theano.gof.Env) - if fn_storage[0] is None: - y_next = scan_env.outputs[0] - gy_next = y_next.type() - inputs = scan_env.inputs # x_this, y_this, *rest - g_inputs = theano.tensor.grad(y_next, inputs, g_cost=gy_next) - - fn_storage[0] = theano.function( - inputs=[gy_next] + inputs, - outputs=g_inputs) - return fn_storage[0] - - def perform(self, node, args, grad_storage): - - #retrieve (or compute) the gradient function - fn = self.get_fn(args[2], grad_storage) - - #unpack the args - (g_y, y) = args[0:2] - (x, u) = args[3:5] - other_args = args[5:] - - #unpack grad_storage (outputs) - gx_out, gu_out = grad_storage[0:2] - g_other_storage = grad_storage[2:-1] - - assert len(other_args) == len(g_other_storage) - - if not self.inplace: - g_y = g_y.copy() - - gx = numpy.zeros_like(x) - - g_other = [numpy.zeros_like(other) for other in other_args] - - for i in xrange(len(x)-1, -1, -1): - #print 'x y gy_next', x[i], y[i], g_y[i+1] - grads = fn(g_y[i+1], x[i], y[i], *other_args) - gx[i], gy_i = grads[0:2] - #print 'gx gy', gx[i], gy_i - g_y[i] += gy_i - - #now increment the other-input gradient buffers - assert len(g_other) == (len(grads)-2) - for g_arg_buffer, g_arg in zip(g_other, grads[2:]): - g_arg_buffer += g_arg - - #write results into storage locations - gx_out[0] = gx - gu_out[0] = g_y[0] - assert len(g_other_storage) == len(g_other) - for grad_storage, grad in zip(g_other_storage, g_other): - grad_storage[0] = grad - -scan1_grad = Scan1Grad()
--- a/pylearn/lib/test_scan.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -import numpy -import theano -from theano.tensor import dscalar, dvector, dmatrix -from scan import scan1 - - -def test_extra_inputs(): - u = dscalar('u') - c = dscalar('c') - x = dvector('x') - - y = scan1(x, u, c, lambda x_i, y_prev, c: (x_i + y_prev) * c) - - sum_y = theano.tensor.sum(y) - - f = theano.function([x,u, c], y) - - xval = numpy.asarray([1., 1, 1. , 1, 1]) - uval = numpy.asarray(2.) - - yval = f(xval, uval, 2.0) - assert numpy.all(yval == [2., 6., 14., 30., 62., 126.]) - - - - g_x = theano.tensor.grad(sum_y, x) - g_u = theano.tensor.grad(sum_y, u) - - gf = theano.function([x, u, c], [g_x, g_u]) - - gxval, guval = gf(xval, uval, 2.0) - - #print gxval - #print guval - assert numpy.all(gxval == [ 62., 30., 14., 6., 2.]) - assert numpy.all(guval == 63) - - -def test_verify_scan_grad(): - def scanxx(x, u, c): - # u = dvector('u') - # c = dvector('c') - # x = dmatrix('x') - y = scan1(x, u, c, lambda x_i, y_prev, c: (x_i + y_prev) * c) - return y - - rng = numpy.random.RandomState(456) - - xval = rng.rand(4, 3) - uval = rng.rand(3) - cval = rng.rand(3) - - theano.tensor.verify_grad(scanxx, (xval, uval, cval), rng=rng) -
--- a/pylearn/old_dataset/_test_dataset.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,680 +0,0 @@ -#!/bin/env python -from dataset import * -from math import * -import numpy, unittest, sys -#from misc import * -from lookup_list import LookupList - -def have_raised(to_eval, **var): - have_thrown = False - try: - eval(to_eval) - except : - have_thrown = True - return have_thrown - -def have_raised2(f, *args, **kwargs): - have_thrown = False - try: - f(*args, **kwargs) - except : - have_thrown = True - return have_thrown - -def test1(): - print "test1" - global a,ds - a = numpy.random.rand(10,4) - print a - ds = ArrayDataSet(a,{'x':slice(3),'y':3,'z':[0,2]}) - print "len(ds)=",len(ds) - assert(len(ds)==10) - print "example 0 = ",ds[0] -# assert - print "x=",ds["x"] - print "x|y" - for x,y in ds("x","y"): - print x,y - minibatch_iterator = ds.minibatches(fieldnames=['z','y'],n_batches=1,minibatch_size=3,offset=4) - minibatch = minibatch_iterator.__iter__().next() - print "minibatch=",minibatch - for var in minibatch: - print "var=",var - print "take a slice and look at field y",ds[1:6:2]["y"] - - del a,ds,x,y,minibatch_iterator,minibatch,var - -def test_iterate_over_examples(array,ds): -#not in doc!!! - i=0 - for example in range(len(ds)): - wanted = array[example][:3] - returned = ds[example]['x'] - if (wanted != returned).all(): - print 'returned:', returned - print 'wanted:', wanted - assert (ds[example]['x']==array[example][:3]).all() - assert ds[example]['y']==array[example][3] - assert (ds[example]['z']==array[example][[0,2]]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for example in dataset: - i=0 - for example in ds: - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,... 
in dataset: - i=0 - for x,y,z in ds: - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - -# - for example in dataset(field1, field2,field3, ...): - i=0 - for example in ds('x','y','z'): - assert len(example)==3 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (example['z']==array[i][0:3:2]).all() - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - i=0 - for example in ds('y','x'): - assert len(example)==2 - assert (example['x']==array[i][:3]).all() - assert example['y']==array[i][3] - assert (numpy.append(example['x'],example['y'])==array[i]).all() - i+=1 - assert i==len(ds) - del example,i - -# - for val1,val2,val3 in dataset(field1, field2,field3): - i=0 - for x,y,z in ds('x','y','z'): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,z,i - i=0 - for y,x in ds('y','x',): - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==len(ds) - del x,y,i - - def test_minibatch_size(minibatch,minibatch_size,len_ds,nb_field,nb_iter_finished): - ##full minibatch or the last minibatch - for idx in range(nb_field): - test_minibatch_field_size(minibatch[idx],minibatch_size,len_ds,nb_iter_finished) - del idx - def test_minibatch_field_size(minibatch_field,minibatch_size,len_ds,nb_iter_finished): - assert len(minibatch_field)==minibatch_size or ((nb_iter_finished*minibatch_size+len(minibatch_field))==len_ds and len(minibatch_field)<minibatch_size) - -# - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): - i=0 - mi=0 - size=3 - m=ds.minibatches(['x','z'], minibatch_size=size) - assert hasattr(m,'__iter__') - for minibatch in m: - assert isinstance(minibatch,LookupList) - assert len(minibatch)==2 - test_minibatch_size(minibatch,size,len(ds),2,mi) - if type(ds)==ArrayDataSet: - assert (minibatch[0][:,::2]==minibatch[1]).all() - else: - for j in xrange(len(minibatch[0])): - (minibatch[0][j][::2]==minibatch[1][j]).all() - mi+=1 - i+=len(minibatch[0]) - assert i==(len(ds)/size)*size - assert mi==(len(ds)/size) - del minibatch,i,m,mi,size - - i=0 - mi=0 - size=3 - m=ds.minibatches(['x','y'], minibatch_size=size) - assert hasattr(m,'__iter__') - for minibatch in m: - assert isinstance(minibatch,LookupList) - assert len(minibatch)==2 - test_minibatch_size(minibatch,size,len(ds),2,mi) - mi+=1 - for id in range(len(minibatch[0])): - assert (numpy.append(minibatch[0][id],minibatch[1][id])==array[i]).all() - i+=1 - assert i==(len(ds)/size)*size - assert mi==(len(ds)/size) - del minibatch,i,id,m,mi,size - -# - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): - i=0 - mi=0 - size=3 - m=ds.minibatches(['x','z'], minibatch_size=size) - assert hasattr(m,'__iter__') - for x,z in m: - test_minibatch_field_size(x,size,len(ds),mi) - test_minibatch_field_size(z,size,len(ds),mi) - for id in range(len(x)): - assert (x[id][::2]==z[id]).all() - i+=1 - mi+=1 - assert i==(len(ds)/size)*size - assert mi==(len(ds)/size) - del x,z,i,m,mi,size - - i=0 - mi=0 - size=3 - m=ds.minibatches(['x','y'], minibatch_size=3) - assert hasattr(m,'__iter__') - for x,y in m: - assert len(x)==size - assert len(y)==size - test_minibatch_field_size(x,size,len(ds),mi) - 
test_minibatch_field_size(y,size,len(ds),mi) - mi+=1 - for id in range(len(x)): - assert (numpy.append(x[id],y[id])==array[i]).all() - i+=1 - assert i==(len(ds)/size)*size - assert mi==(len(ds)/size) - del x,y,i,id,m,mi,size - -#not in doc - i=0 - size=3 - m=ds.minibatches(['x','y'],n_batches=1,minibatch_size=size,offset=4) - assert hasattr(m,'__iter__') - for x,y in m: - assert len(x)==size - assert len(y)==size - for id in range(size): - assert (numpy.append(x[id],y[id])==array[i+4]).all() - i+=1 - assert i==size - del x,y,i,id,m,size - - i=0 - size=3 - m=ds.minibatches(['x','y'],n_batches=2,minibatch_size=size,offset=4) - assert hasattr(m,'__iter__') - for x,y in m: - assert len(x)==size - assert len(y)==size - for id in range(size): - assert (numpy.append(x[id],y[id])==array[i+4]).all() - i+=1 - assert i==2*size - del x,y,i,id,m,size - - i=0 - size=3 - m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=size,offset=4) - assert hasattr(m,'__iter__') - for x,y in m: - assert len(x)==size - assert len(y)==size - for id in range(size): - assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all() - i+=1 - assert i==2*size # should not wrap - del x,y,i,id,size - - assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0) - assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0) - -def test_ds_iterator(array,iterator1,iterator2,iterator3): - l=len(iterator1) - i=0 - for x,y in iterator1: - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==l - i=0 - for y,z in iterator2: - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - i+=1 - assert i==l - i=0 - for x,y,z in iterator3: - assert (x==array[i][:3]).all() - assert y==array[i][3] - assert (z==array[i][0:3:2]).all() - assert (numpy.append(x,y)==array[i]).all() - i+=1 - assert i==l - -def test_getitem(array,ds): - def test_ds(orig,ds,index): - i=0 - assert isinstance(ds,LookupList) - assert len(ds)==3 - assert len(ds[0])==len(index) -# for x,z,y in ds('x','z','y'): - for idx in index: - assert (orig[idx]['x']==array[idx][:3]).all() - assert (orig[idx]['x']==ds['x'][i]).all() - assert orig[idx]['y']==array[idx][3] - assert (orig[idx]['y']==ds['y'][i]).all() # why does it crash sometimes? - assert (orig[idx]['z']==array[idx][0:3:2]).all() - assert (orig[idx]['z']==ds['z'][i]).all() - i+=1 - del i - ds[0] - if len(ds)>2: - ds[:1] - ds[1:1] - ds[1:1:1] - if len(ds)>5: - ds[[1,2,3]] - for x in ds: - pass - -#ds[:n] returns a LookupList with the n first examples. - ds2=ds[:3] - test_ds(ds,ds2,index=[0,1,2]) - del ds2 - -#ds[i:j] returns a LookupList with examples i,i+1,...,j-1. - ds2=ds[1:3] - test_ds(ds,ds2,index=[1,2]) - del ds2 - -#ds[i1:i2:s] returns a LookupList with the examples i1,i1+s,...i2-s. - ds2=ds[1:7:2] - test_ds(ds,ds2,[1,3,5]) - del ds2 - -#ds[i] returns the (i+1)-th example of the dataset. - ds2=ds[5] - assert isinstance(ds2,Example) - test_ds(ds,ds2,[5]) - assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined - assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) - del ds2 - -#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. - ds2=ds[[4,7,2,8]] -# assert isinstance(ds2,DataSet) - test_ds(ds,ds2,[4,7,2,8]) - del ds2 - - #ds.<property># returns the value of a property associated with - #the name <property>. 
The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) - - #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds - assert have_raised2(hstack,[ds('x'),ds('x')]) - assert have_raised2(hstack,[ds('y','x'),ds('x')]) - assert not have_raised2(hstack,[ds('x'),ds('y')]) - - # i=0 - # for example in hstack([ds('x'),ds('y'),ds('z')]): - # example==ds[i] - # i+=1 - # del i,example - #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? - -def test_subset(array,ds): - def test_ds(orig,ds,index): - i=0 - assert isinstance(ds2,DataSet) - assert len(ds)==len(index) - for x,z,y in ds('x','z','y'): - assert (orig[index[i]]['x']==array[index[i]][:3]).all() - assert (orig[index[i]]['x']==x).all() - assert orig[index[i]]['y']==array[index[i]][3] - assert orig[index[i]]['y']==y - assert (orig[index[i]]['z']==array[index[i]][0:3:2]).all() - assert (orig[index[i]]['z']==z).all() - i+=1 - del i - ds[0] - if len(ds)>2: - ds[:1] - ds[1:1] - ds[1:1:1] - if len(ds)>5: - ds[[1,2,3]] - for x in ds: - pass - -#ds[:n] returns a dataset with the n first examples. - ds2=ds.subset[:3] - test_ds(ds,ds2,index=[0,1,2]) -# del ds2 - -#ds[i1:i2:s]# returns a ds with the examples i1,i1+s,...i2-s. - ds2=ds.subset[1:7:2] - test_ds(ds,ds2,[1,3,5]) -# del ds2 - -# #ds[i] -# ds2=ds.subset[5] -# assert isinstance(ds2,Example) -# assert have_raised("var['ds']["+str(len(ds))+"]",ds=ds) # index not defined -# assert not have_raised("var['ds']["+str(len(ds)-1)+"]",ds=ds) -# del ds2 - -#ds[[i1,i2,...in]]# returns a ds with examples i1,i2,...in. - ds2=ds.subset[[4,7,2,8]] - test_ds(ds,ds2,[4,7,2,8]) -# del ds2 - -#ds.<property># returns the value of a property associated with - #the name <property>. The following properties should be supported: - # - 'description': a textual description or name for the ds - # - 'fieldtypes': a list of types (one per field) - -#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#???? - #assert hstack([ds('x','y'),ds('z')])==ds - #hstack([ds('z','y'),ds('x')])==ds - assert have_raised2(hstack,[ds('x'),ds('x')]) - assert have_raised2(hstack,[ds('y','x'),ds('x')]) - assert not have_raised2(hstack,[ds('x'),ds('y')]) - -# i=0 -# for example in hstack([ds('x'),ds('y'),ds('z')]): -# example==ds[i] -# i+=1 -# del i,example -#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#???? - -def test_fields_fct(ds): - #@todo, fill correctly - assert len(ds.fields())==3 - i=0 - v=0 - for field in ds.fields(): - for field_value in field: # iterate over the values associated to that field for all the ds examples - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - i=0 - v=0 - for field in ds('x','z').fields(): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field in ds.fields('x','y'): - i+=1 - for val in field: - v+=1 - assert i==2 - assert v==2*10 - del i,v - - i=0 - v=0 - for field_examples in ds.fields(): - for example_value in field_examples: - v+=1 - i+=1 - assert i==3 - assert v==3*10 - del i,v - - assert ds == ds.fields().examples() - assert len(ds('x','y').fields()) == 2 - assert len(ds('x','z').fields()) == 2 - assert len(ds('y').fields()) == 1 - - del field - -def test_overrides(ds) : - """ Test for examples that an override __getitem__ acts as the one in DataSet """ - def ndarray_list_equal(nda,l) : - """ - Compares if a ndarray is the same as the list. 
Do it by converting the list into - an numpy.ndarray, if possible - """ - try : - l = numpy.asmatrix(l) - except : - return False - return smart_equal(nda,l) - - def smart_equal(a1,a2) : - """ - Handles numpy.ndarray, LookupList, and basic containers - """ - if not isinstance(a1,type(a2)) and not isinstance(a2,type(a1)): - #special case: matrix vs list of arrays - if isinstance(a1,numpy.ndarray) : - return ndarray_list_equal(a1,a2) - elif isinstance(a2,numpy.ndarray) : - return ndarray_list_equal(a2,a1) - return False - # compares 2 numpy.ndarray - if isinstance(a1,numpy.ndarray): - if len(a1.shape) != len(a2.shape): - return False - for k in range(len(a1.shape)) : - if a1.shape[k] != a2.shape[k]: - return False - return (a1==a2).all() - # compares 2 lookuplists - if isinstance(a1,LookupList) : - if len(a1._names) != len(a2._names) : - return False - for k in a1._names : - if k not in a2._names : - return False - if not smart_equal(a1[k],a2[k]) : - return False - return True - # compares 2 basic containers - if hasattr(a1,'__len__'): - if len(a1) != len(a2) : - return False - for k in range(len(a1)) : - if not smart_equal(a1[k],a2[k]): - return False - return True - # try basic equals - return a1 is a2 - - def mask(ds) : - class TestOverride(type(ds)): - def __init__(self,ds) : - self.ds = ds - def __getitem__(self,key) : - res1 = self.ds[key] - res2 = DataSet.__getitem__(ds,key) - assert smart_equal(res1,res2) - return res1 - return TestOverride(ds) - # test getitem - ds2 = mask(ds) - for k in range(10): - res = ds2[k] - res = ds2[1:len(ds):3] - - - - - - -def test_all(array,ds): - assert len(ds)==10 - test_iterate_over_examples(array, ds) - test_overrides(ds) - test_getitem(array, ds) - test_subset(array, ds) - test_ds_iterator(array,ds('x','y'),ds('y','z'),ds('x','y','z')) - test_fields_fct(ds) - - -class T_DataSet(unittest.TestCase): - def test_ArrayDataSet(self): - #don't test stream - #tested only with float value - #don't always test with y - #don't test missing value - #don't test with tuple - #don't test proterties - a2 = numpy.random.rand(10,4) - ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested - ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - #assert ds==a? should this work? 
- - test_all(a2,ds) - - del a2, ds - - def test_CachedDataSet(self): - a = numpy.random.rand(10,4) - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - ds2 = CachedDataSet(ds1) - ds3 = CachedDataSet(ds1,cache_all_upon_construction=True) - - test_all(a,ds2) - test_all(a,ds3) - - del a,ds1,ds2,ds3 - - - def test_DataSetFields(self): - raise NotImplementedError() - - def test_ApplyFunctionDataSet(self): - a = numpy.random.rand(10,4) - a2 = a+1 - ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested - - ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False) - ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1), - ['x','y','z'], - minibatch_mode=True) - - test_all(a2,ds2) - test_all(a2,ds3) - - del a,ds1,ds2,ds3 - - def test_FieldsSubsetDataSet(self): - a = numpy.random.rand(10,4) - ds = ArrayDataSet(a,Example(['x','y','z','w'],[slice(3),3,[0,2],0])) - ds = FieldsSubsetDataSet(ds,['x','y','z']) - - test_all(a,ds) - - del a, ds - - def test_RenamedFieldsDataSet(self): - a = numpy.random.rand(10,4) - ds = ArrayDataSet(a,Example(['x1','y1','z1','w1'],[slice(3),3,[0,2],0])) - ds = RenamedFieldsDataSet(ds,['x1','y1','z1'],['x','y','z']) - - test_all(a,ds) - - del a, ds - - def test_MinibatchDataSet(self): - raise NotImplementedError() - def test_HStackedDataSet(self): - raise NotImplementedError() - def test_VStackedDataSet(self): - raise NotImplementedError() - def test_ArrayFieldsDataSet(self): - raise NotImplementedError() - - -class T_Exotic1(unittest.TestCase): - class DataSet(DataSet): - """ Dummy dataset, where one field is a ndarray of variables size. """ - def __len__(self) : - return 100 - def fieldNames(self) : - return 'input','target','name' - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class MultiLengthDataSetIterator(object): - def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): - if fieldnames is None: fieldnames = dataset.fieldNames() - self.minibatch = Example(fieldnames,range(len(fieldnames))) - self.dataset, self.minibatch_size, self.current = dataset, minibatch_size, offset - def __iter__(self): - return self - def next(self): - for k in self.minibatch._names : - self.minibatch[k] = [] - for ex in range(self.minibatch_size) : - if 'input' in self.minibatch._names: - self.minibatch['input'].append( numpy.array( range(self.current + 1) ) ) - if 'target' in self.minibatch._names: - self.minibatch['target'].append( self.current % 2 ) - if 'name' in self.minibatch._names: - self.minibatch['name'].append( str(self.current) ) - self.current += 1 - return self.minibatch - return MultiLengthDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - def test_ApplyFunctionDataSet(self): - ds = T_Exotic1.DataSet() - dsa = ApplyFunctionDataSet(ds,lambda x,y,z: (x[-1],y*10,int(z)),['input','target','name'],minibatch_mode=False) #broken!!!!!! 
-        for k in range(len(dsa)):
-            res = dsa[k]
-            self.failUnless(ds[k]('input')[0][-1] == res('input')[0] , 'problem in first applied function')
-        res = dsa[33:96:3]
-
-    def test_CachedDataSet(self):
-        ds = T_Exotic1.DataSet()
-        dsc = CachedDataSet(ds)
-        for k in range(len(dsc)) :
-            self.failUnless(numpy.all( dsc[k]('input')[0] == ds[k]('input')[0] ) , (dsc[k],ds[k]) )
-        res = dsc[:]
-
-if __name__=='__main__':
-    tests = []
-    debug=False
-    if len(sys.argv)==1:
-        unittest.main()
-    else:
-        assert sys.argv[1]=="--debug"
-        for arg in sys.argv[2:]:
-            tests.append(arg)
-        if tests:
-            unittest.TestSuite(map(T_DataSet, tests)).debug()
-        else:
-            module = __import__("_test_dataset")
-            tests = unittest.TestLoader().loadTestsFromModule(module)
-            tests.debug()
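The deleted _test_dataset.py above exercises the old minibatch protocol: an ArrayDataSet maps field names to column selections of a numpy array, and minibatches() yields LookupList batches whose fields can be unpacked directly. A minimal sketch of that legacy usage, assuming the flat module import used by the removed tests (the actual import path may have differed):

import numpy
from dataset import ArrayDataSet   # module deleted in this changeset; import path assumed

a = numpy.random.rand(10, 4)
# field -> column selection: 'x' -> columns 0:3, 'y' -> column 3, 'z' -> columns 0 and 2
ds = ArrayDataSet(a, {'x': slice(3), 'y': 3, 'z': [0, 2]})

for x, y in ds.minibatches(['x', 'y'], minibatch_size=3):
    # each field of a batch is a row-slice of that field's columns;
    # as the removed test asserts, only full-size batches are produced
    assert len(x) == 3 and len(y) == 3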
--- a/pylearn/old_dataset/_test_lookup_list.py	Wed Oct 14 10:19:37 2009 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-from lookup_list import *
-import unittest
-
-class T_LookUpList(unittest.TestCase):
-    def test_LookupList(self):
-        #test only the example in the doc???
-        example = LookupList(['x','y','z'],[1,2,3])
-        example['x'] = [1, 2, 3] # set or change a field
-        x, y, z = example
-        x = example[0]
-        x = example["x"]
-        assert example.keys()==['x','y','z']
-        assert example.values()==[[1,2,3],2,3]
-        assert example.items()==[('x',[1,2,3]),('y',2),('z',3)]
-        example.append_keyval('u',0) # adds item with name 'u' and value 0
-        assert len(example)==4 # number of items = 4 here
-        example2 = LookupList(['v','w'], ['a','b'])
-        example3 = LookupList(['x','y','z','u','v','w'], [[1, 2, 3],2,3,0,'a','b'])
-        assert example+example2==example3
-        self.assertRaises(AssertionError,example.__add__,example)
-        del example, example2, example3, x, y ,z
-
-if __name__=='__main__':
-    unittest.main()
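The removed _test_lookup_list.py summarizes LookupList behaviour: named fields in a fixed order, dict-style access, append_keyval for adding a field, and '+' for concatenating lists with disjoint keys. A short sketch of that behaviour, assuming the same flat lookup_list import used by the deleted test:

from lookup_list import LookupList   # import path as in the removed test; may differ elsewhere

example = LookupList(['x', 'y', 'z'], [1, 2, 3])
example['x'] = [1, 2, 3]                      # set or change a field by name
assert example.keys() == ['x', 'y', 'z']
assert example.values() == [[1, 2, 3], 2, 3]
example.append_keyval('u', 0)                 # add a new named field
assert len(example) == 4
# '+' concatenates LookupLists; duplicate keys raise AssertionError per the removed test
combined = example + LookupList(['v', 'w'], ['a', 'b'])
assert combined.keys() == ['x', 'y', 'z', 'u', 'v', 'w']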
--- a/pylearn/old_dataset/dataset.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1533 +0,0 @@ - -from lookup_list import LookupList as Example -from common.misc import unique_elements_list_intersection -from string import join -from sys import maxint -import numpy, copy - -from exceptions import * - -class AttributesHolder(object): - def __init__(self): pass - - def attributeNames(self): - raise AbstractFunction() - - def setAttributes(self,attribute_names,attribute_values,make_copies=False): - """ - Allow the attribute_values to not be a list (but a single value) if the attribute_names is of length 1. - """ - if len(attribute_names)==1 and not (isinstance(attribute_values,list) or isinstance(attribute_values,tuple) ): - attribute_values = [attribute_values] - if make_copies: - for name,value in zip(attribute_names,attribute_values): - self.__setattr__(name,copy.deepcopy(value)) - else: - for name,value in zip(attribute_names,attribute_values): - self.__setattr__(name,value) - - def getAttributes(self,attribute_names=None, return_copy=False): - """ - Return all (if attribute_names=None, in the order of attributeNames()) or a specified subset of attributes. - """ - if attribute_names is None: - attribute_names = self.attributeNames() - if return_copy: - return [copy.copy(self.__getattribute__(name)) for name in attribute_names] - else: - return [self.__getattribute__(name) for name in attribute_names] - -class DataSet(AttributesHolder): - """A virtual base class for datasets. - - A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction - with learning algorithms (for training and testing them): rows/records are called examples, and - columns/attributes are called fields. The field value for a particular example can be an arbitrary - python object, which depends on the particular dataset. - - We call a DataSet a 'stream' when its length is unbounded (in which case its __len__ method - should return sys.maxint). - - A DataSet is a generator of iterators; these iterators can run through the - examples or the fields in a variety of ways. A DataSet need not necessarily have a finite - or known length, so this class can be used to interface to a 'stream' which - feeds on-line learning (however, as noted below, some operations are not - feasible or not recommended on streams). - - To iterate over examples, there are several possibilities: - - for example in dataset: - - for val1,val2,... in dataset: - - for example in dataset(field1, field2,field3, ...): - - for val1,val2,val3 in dataset(field1, field2,field3): - - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N): - - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N): - Each of these is documented below. All of these iterators are expected - to provide, in addition to the usual 'next()' method, a 'next_index()' method - which returns a non-negative integer pointing to the position of the next - example that will be returned by 'next()' (or of the first example in the - next minibatch returned). This is important because these iterators - can wrap around the dataset in order to do multiple passes through it, - in possibly unregular ways if the minibatch size is not a divisor of the - dataset length. 
- - To iterate over fields, one can do - - for field in dataset.fields(): - for field_value in field: # iterate over the values associated to that field for all the dataset examples - - for field in dataset(field1,field2,...).fields() to select a subset of fields - - for field in dataset.fields(field1,field2,...) to select a subset of fields - and each of these fields is iterable over the examples: - - for field_examples in dataset.fields(): - for example_value in field_examples: - ... - but when the dataset is a stream (unbounded length), it is not recommended to do - such things because the underlying dataset may refuse to access the different fields in - an unsynchronized ways. Hence the fields() method is illegal for streams, by default. - The result of fields() is a L{DataSetFields} object, which iterates over fields, - and whose elements are iterable over examples. A DataSetFields object can - be turned back into a DataSet with its examples() method:: - dataset2 = dataset1.fields().examples() - and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). - - Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. - - Note: The content of a field can be of any type. Field values can also be 'missing' - (e.g. to handle semi-supervised learning), and in the case of numeric (numpy array) - fields (i.e. an ArrayFieldsDataSet), NaN plays the role of a missing value. - What about non-numeric values? None. - - Dataset elements can be indexed and sub-datasets (with a subset - of examples) can be extracted. These operations are not supported - by default in the case of streams. - - - dataset[:n] returns an Example with the n first examples. - - - dataset[i1:i2:s] returns an Example with the examples i1,i1+s,...i2-s. - - - dataset[i] returns an Example. - - - dataset[[i1,i2,...in]] returns an Example with examples i1,i2,...in. - - A similar command gives you a DataSet instead of Examples : - - - dataset.subset[:n] returns a DataSet with the n first examples. - - - dataset.subset[i1:i2:s] returns a DataSet with the examples i1,i1+s,...i2-s. - - - dataset.subset[i] returns a DataSet. - - - dataset.subset[[i1,i2,...in]] returns a DataSet with examples i1,i2,...in. - - - - dataset.<property> returns the value of a property associated with - the name <property>. The following properties should be supported: - - 'description': a textual description or name for the dataset - - 'fieldtypes': a list of types (one per field) - A DataSet may have other attributes that it makes visible to other objects. These are - used to store information that is not example-wise but global to the dataset. - The list of names of these attributes is given by the attribute_names() method. - - Datasets can be concatenated either vertically (increasing the length) or - horizontally (augmenting the set of fields), if they are compatible, using - the following operations (with the same basic semantics as numpy.hstack - and numpy.vstack): - - - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3]) - - creates a new dataset whose list of fields is the concatenation of the list of - fields of the argument datasets. This only works if they all have the same length. - - - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3]) - - creates a new dataset that concatenates the examples from the argument datasets - (and whose length is the sum of the length of the argument datasets). 
This only - works if they all have the same fields. - - According to the same logic, and viewing a DataSetFields object associated to - a DataSet as a kind of transpose of it, fields1 & fields2 concatenates fields of - a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their - examples. - - A dataset can hold arbitrary key-value pairs that may be used to access meta-data - or other properties of the dataset or associated with the dataset or the result - of a computation stored in a dataset. These can be accessed through the [key] syntax - when key is a string (or more specifically, neither an integer, a slice, nor a list). - - A DataSet sub-class should always redefine the following methods: - - __len__ if it is not a stream - - fieldNames - - minibatches_nowrap (called by DataSet.minibatches()) - For efficiency of implementation, a sub-class might also want to redefine - - valuesHStack - - valuesVStack - - hasFields - - __getitem__ may not be feasible with some streams - - __iter__ - A sub-class should also append attributes to self._attribute_names - (the default value returned by attributeNames()). - By convention, attributes not in attributeNames() should have a name - starting with an underscore. - @todo enforce/test that convention! - """ - - numpy_vstack = lambda fieldname,values: numpy.vstack(values) - numpy_hstack = lambda fieldnames,values: numpy.hstack(values) - - def __init__(self, description=None, fieldnames=None, fieldtypes=None): - """ - @type fieldnames: list of strings - @type fieldtypes: list of python types, same length as fieldnames - @type description: string - @param description: description/name for this dataset - """ - def default_desc(): - return type(self).__name__ \ - + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )" - - #self.fieldnames = fieldnames - - self.fieldtypes = fieldtypes if fieldtypes is not None \ - else [None]*1 #len(fieldnames) - - self.description = default_desc() if description is None \ - else description - self._attribute_names = ["description"] - - - attributeNames = property(lambda self: copy.copy(self._attribute_names)) - - def __contains__(self, fieldname): - return (fieldname in self.fieldNames()) \ - or (fieldname in self.attributeNames()) - - def __iter__(self): - """Supports the syntax "for i in dataset: ..." - - Using this syntax, "i" will be an Example instance (or equivalent) with - all the fields of DataSet self. Every field of "i" will give access to - a field of a single example. Fields should be accessible via - i["fielname"] or i[3] (in the order defined by the elements of the - Example returned by this iterator), but the derived class is free - to accept any type of identifier, and add extra functionality to the iterator. - - The default implementation calls the minibatches iterator and extracts the first example of each field. - """ - return DataSet.MinibatchToSingleExampleIterator(self.minibatches(None, minibatch_size = 1)) - - def __len__(self): - """ - len(dataset) returns the number of examples in the dataset. - By default, a DataSet is a 'stream', i.e. it has an unbounded length (sys.maxint). - Sub-classes which implement finite-length datasets should redefine this method. - Some methods only make sense for finite-length datasets. - """ - from sys import maxint - return maxint - - - class MinibatchToSingleExampleIterator(object): - """ - Converts the result of minibatch iterator with minibatch_size==1 into - single-example values in the result. 
Therefore the result of - iterating on the dataset itself gives a sequence of single examples - (whereas the result of iterating over minibatches gives in each - Example field an iterable object over the individual examples in - the minibatch). - """ - def __init__(self, minibatch_iterator): - self.minibatch_iterator = minibatch_iterator - self.minibatch = None - def __iter__(self): #makes for loop work - return self - def next(self): - size1_minibatch = self.minibatch_iterator.next() - if not self.minibatch: - names = size1_minibatch.keys() - # next lines are a hack, but there was problem when we were getting [array(327)] for instance - try: - values = [value[0] for value in size1_minibatch.values()] - except : - values = [value for value in size1_minibatch.values()] - self.minibatch = Example(names,values) - else: - self.minibatch._values = [value[0] for value in size1_minibatch.values()] - return self.minibatch - - def next_index(self): - return self.minibatch_iterator.next_index() - - class MinibatchWrapAroundIterator(object): - """ - An iterator for minibatches that handles the case where we need to wrap around the - dataset because n_batches*minibatch_size > len(dataset). It is constructed from - a dataset that provides a minibatch iterator that does not need to handle that problem. - This class is a utility for dataset subclass writers, so that they do not have to handle - this issue multiple times, nor check that fieldnames are valid, nor handle the - empty fieldnames (meaning 'use all the fields'). - """ - def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset): - self.dataset=dataset - self.fieldnames=fieldnames - self.minibatch_size=minibatch_size - self.n_batches=n_batches - self.n_batches_done=0 - self.next_row=offset - self.L=len(dataset) - self.offset=offset % self.L - ds_nbatches = (self.L-self.next_row)/self.minibatch_size - if n_batches is not None: - ds_nbatches = min(n_batches,ds_nbatches) - if fieldnames: - assert dataset.hasFields(*fieldnames) - else: - self.fieldnames=dataset.fieldNames() - self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row) - - def __iter__(self): - return self - - def next_index(self): - return self.next_row - - def next(self): - if self.n_batches and self.n_batches_done==self.n_batches: - raise StopIteration - elif not self.n_batches and self.next_row ==self.L: - raise StopIteration - upper = self.next_row+self.minibatch_size - if upper <=self.L: - minibatch = self.iterator.next() - else: - if not self.n_batches: - upper=min(upper, self.L) - # if their is not a fixed number of batch, we continue to the end of the dataset. 
- # this can create a minibatch that is smaller then the minibatch_size - assert (self.L-self.next_row)<=self.minibatch_size - minibatch = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() - else: - # we must concatenate (vstack) the bottom and top parts of our minibatch - # first get the beginning of our minibatch (top of dataset) - first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next() - second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next() - minibatch = Example(self.fieldnames, - [self.dataset.valuesVStack(name,[first_part[name],second_part[name]]) - for name in self.fieldnames]) - self.next_row=upper - self.n_batches_done+=1 - if upper >= self.L and self.n_batches: - self.next_row -= self.L - ds_nbatches = (self.L-self.next_row)/self.minibatch_size - if self.n_batches is not None: - ds_nbatches = min(self.n_batches,ds_nbatches) - self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, - ds_nbatches,self.next_row) - return DataSetFields(MinibatchDataSet(minibatch,self.dataset.valuesVStack, - self.dataset.valuesHStack), - minibatch.keys()) - - - minibatches_fieldnames = None - minibatches_minibatch_size = 1 - minibatches_n_batches = None - def minibatches(self, - fieldnames = minibatches_fieldnames, - minibatch_size = minibatches_minibatch_size, - n_batches = minibatches_n_batches, - offset = 0): - """ - Return an iterator that supports three forms of syntax: - - for i in dataset.minibatches(None,**kwargs): ... - - for i in dataset.minibatches([f1, f2, f3],**kwargs): ... - - for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... - - Using the first two syntaxes, "i" will be an indexable object, such as a list, - tuple, or Example instance. In both cases, i[k] is a list-like container - of a batch of current examples. In the second case, i[0] is - list-like container of the f1 field of a batch current examples, i[1] is - a list-like container of the f2 field, etc. - - Using the first syntax, all the fields will be returned in "i". - Using the third syntax, i1, i2, i3 will be list-like containers of the - f1, f2, and f3 fields of a batch of examples on each loop iteration. - - The minibatches iterator is expected to return upon each call to next() - a DataSetFields object, which is a Example (indexed by the field names) whose - elements are iterable and indexable over the minibatch examples, and which keeps a pointer to - a sub-dataset that can be used to iterate over the individual examples - in the minibatch. Hence a minibatch can be converted back to a regular - dataset or its fields can be looked at individually (and possibly iterated over). - - PARAMETERS - - fieldnames (list of any type, default None): - The loop variables i1, i2, i3 (in the example above) should contain the - f1, f2, and f3 fields of the current batch of examples. If None, the - derived class can choose a default, e.g. all fields. - - - minibatch_size (integer, default 1) - On every iteration, the variables i1, i2, i3 will have - exactly minibatch_size elements. e.g. len(i1) == minibatch_size - - @DEPRECATED n_batches : not used anywhere - - n_batches (integer, default None) - The iterator will loop exactly this many times, and then stop. If None, - the derived class can choose a default. If (-1), then the returned - iterator should support looping indefinitely. 
- - - offset (integer, default 0) - The iterator will start at example 'offset' in the dataset, rather than the default. - - Note: A list-like container is something like a tuple, list, numpy.ndarray or - any other object that supports integer indexing and slicing. - - @ATTENTION: now minibatches returns minibatches_nowrap, which is supposed to return complete - batches only, raise StopIteration. - @ATTENTION: minibatches returns a LookupList, we can't iterate over examples on it. - - """ - #return DataSet.MinibatchWrapAroundIterator(self, fieldnames, minibatch_size, n_batches,offset) - assert offset >= 0 - assert offset < len(self) - assert offset + minibatch_size -1 < len(self) - if fieldnames == None : - fieldnames = self.fieldNames() - return self.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - """ - This is the minibatches iterator generator that sub-classes must define. - It does not need to worry about wrapping around multiple times across the dataset, - as this is handled by MinibatchWrapAroundIterator when DataSet.minibatches() is called. - The next() method of the returned iterator does not even need to worry about - the termination condition (as StopIteration will be raised by DataSet.minibatches - before an improper call to minibatches_nowrap's next() is made). - That next() method can assert that its next row will always be within [0,len(dataset)). - The iterator returned by minibatches_nowrap does not need to implement - a next_index() method either, as this will be provided by MinibatchWrapAroundIterator. - """ - raise AbstractFunction() - - def is_unbounded(self): - """ - Tests whether a dataset is unbounded (e.g. a stream). - """ - return len(self)==maxint - - def hasFields(self,*fieldnames): - """ - Return true if the given field name (or field names, if multiple arguments are - given) is recognized by the DataSet (i.e. can be used as a field name in one - of the iterators). - - The default implementation may be inefficient (O(# fields in dataset)), as it calls the fieldNames() - method. Many datasets may store their field names in a dictionary, which would allow more efficiency. - """ - return len(unique_elements_list_intersection(fieldnames,self.fieldNames()))>0 - - def fieldNames(self): - """ - Return the list of field names that are supported by the iterators, - and for which hasFields(fieldname) would return True. - """ - raise AbstractFunction() - - def __call__(self,*fieldnames): - """ - Return a dataset that sees only the fields whose name are specified. - """ - assert self.hasFields(*fieldnames) - #return self.fields(*fieldnames).examples() - fieldnames_list = list(fieldnames) - return FieldsSubsetDataSet(self,fieldnames_list) - - def cached_fields_subset(self,*fieldnames) : - """ - Behaviour is supposed to be the same as __call__(*fieldnames), but the dataset returned is cached. - @see : dataset.__call__ - """ - assert self.hasFields(*fieldnames) - return self.fields(*fieldnames).examples() - - def fields(self,*fieldnames): - """ - Return a DataSetFields object associated with this dataset. - """ - return DataSetFields(self,fieldnames) - - def getitem_key(self, fieldname): - """A not-so-well thought-out place to put code that used to be in - getitem. - """ - #removing as per discussion June 4. 
--JSB - - i = fieldname - # else check for a fieldname - if self.hasFields(i): - return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0] - # else we are trying to access a property of the dataset - assert i in self.__dict__ # else it means we are trying to access a non-existing property - return self.__dict__[i] - - def __getitem__(self,i): - """ - @rtype: Example - @returns: single or multiple examples - - @type i: integer or slice or <iterable> of integers - @param i: - dataset[i] returns the (i+1)-th example of the dataset. - dataset[i:j] returns a LookupList with examples i,i+1,...,j-1. - dataset[i:j:s] returns a LookupList with examples i,i+2,i+4...,j-2. - dataset[[i1,i2,..,in]] returns a LookupList with examples i1,i2,...,in. - - @note: - Some stream datasets may be unable to implement random access, i.e. - arbitrary slicing/indexing because they can only iterate through - examples one or a minibatch at a time and do not actually store or keep - past (or future) examples. - - The default implementation of getitem uses the minibatches iterator - to obtain one example, one slice, or a list of examples. It may not - always be the most efficient way to obtain the result, especially if - the data are actually stored in a memory array. - """ - - if type(i) is int: - assert i >= 0 # TBM: see if someone complains and want negative i - if i >= len(self) : - raise IndexError - i_batch = self.minibatches_nowrap(self.fieldNames(), - minibatch_size=1, n_batches=1, offset=i) - return DataSet.MinibatchToSingleExampleIterator(i_batch).next() - - #if i is a contiguous slice - if type(i) is slice and (i.step in (None, 1)): - offset = 0 if i.start is None else i.start - upper_bound = len(self) if i.stop is None else i.stop - upper_bound = min(len(self) , upper_bound) - #return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(), - # minibatch_size=upper_bound - offset, - # n_batches=1, - # offset=offset).next()) - # now returns a LookupList - return self.minibatches_nowrap(self.fieldNames(), - minibatch_size=upper_bound - offset, - n_batches=1, - offset=offset).next() - - # if slice has a step param, convert it to list and handle it with the - # list code - if type(i) is slice: - offset = 0 if i.start is None else i.start - upper_bound = len(self) if i.stop is None else i.stop - upper_bound = min(len(self) , upper_bound) - i = list(range(offset, upper_bound, i.step)) - - # handle tuples, arrays, lists - if hasattr(i, '__getitem__'): - for idx in i: - #dis-allow nested slices - if not isinstance(idx, int): - raise TypeError(idx) - if idx >= len(self) : - raise IndexError - # call back into self.__getitem__ - examples = [self.minibatches_nowrap(self.fieldNames(), - minibatch_size=1, n_batches=1, offset=ii).next() - for ii in i] - # re-index the fields in each example by field instead of by example - field_values = [[] for blah in self.fieldNames()] - for e in examples: - for f,v in zip(field_values, e): - f.append(v) - #build them into a LookupList (a.ka. Example) - zz = zip(self.fieldNames(),field_values) - vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz] - example = Example(self.fieldNames(), vst) - #return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack) - # now returns a LookupList - return example - - # what in the world is i? 
- raise TypeError(i, type(i)) - - - """ - Enables the call dataset.subset[a:b:c] that will return a DataSet - around the examples returned by __getitem__(slice(a,b,c)) - - @SEE DataSet.__getsubset(self) - """ - subset = property(lambda s : s.__getsubset(),doc="returns a subset as a DataSet") - - - def __getsubset(self) : - """ - Enables the call data.subset[a:b:c], returns a DataSet. - Default implementation is a simple wrap around __getitem__() using MinibatchDataSet. - - @RETURN DataSet - @SEE DataSet.subset = property(lambda s : s.__getsubset()) - """ - _self = self - class GetSliceReturnsDataSet(object) : - def __getitem__(self,slice) : - return MinibatchDataSet(_self.__getitem__(slice)) - return GetSliceReturnsDataSet() - - - - def valuesHStack(self,fieldnames,fieldvalues): - """ - Return a value that corresponds to concatenating (horizontally) several field values. - This can be useful to merge some fields. The implementation of this operation is likely - to involve a copy of the original values. When the values are numpy arrays, the - result should be numpy.hstack(values). If it makes sense, this operation should - work as well when each value corresponds to multiple examples in a minibatch - e.g. if each value is a Ni-vector and a minibatch of length L is a LxNi matrix, - then the result should be a Lx(N1+N2+..) matrix equal to numpy.hstack(values). - The default is to use numpy.hstack for numpy.ndarray values, and a list - pointing to the original values for other data types. - """ - all_numpy=True - for value in fieldvalues: - if not type(value) is numpy.ndarray: - all_numpy=False - if all_numpy: - return numpy.hstack(fieldvalues) - # the default implementation of horizontal stacking is to put values in a list - return fieldvalues - - def valuesVStack(self,fieldname,values): - """ - @param fieldname: the name of the field from which the values were taken - @type fieldname: any type - - @param values: bits near the beginning or end of the dataset - @type values: list of minibatches (returned by minibatches_nowrap) - - @return: the concatenation (stacking) of the values - @rtype: something suitable as a minibatch field - """ - rval = [] - for v in values: - rval.extend(v) - return rval - - def __or__(self,other): - """ - dataset1 | dataset2 returns a dataset whose list of fields is the concatenation of the list of - fields of the argument datasets. This only works if they all have the same length. - """ - return HStackedDataSet([self,other]) - - def __and__(self,other): - """ - dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets - (and whose length is the sum of the length of the argument datasets). This only - works if they all have the same fields. - """ - return VStackedDataSet([self,other]) - -def hstack(datasets): - """ - hstack(dataset1,dataset2,...) returns dataset1 | datataset2 | ... - which is a dataset whose fields list is the concatenation of the fields - of the individual datasets. - """ - assert len(datasets)>0 - if len(datasets)==1: - return datasets[0] - return HStackedDataSet(datasets) - -def vstack(datasets): - """ - vstack(dataset1,dataset2,...) returns dataset1 & datataset2 & ... - which is a dataset which iterates first over the examples of dataset1, then - over those of dataset2, etc. - """ - assert len(datasets)>0 - if len(datasets)==1: - return datasets[0] - return VStackedDataSet(datasets) - -class FieldsSubsetDataSet(DataSet): - """ - A sub-class of L{DataSet} that selects a subset of the fields. 
- """ - def __init__(self,src,fieldnames): - self.src=src - self.fieldnames=fieldnames - assert src.hasFields(*fieldnames) - self.valuesHStack = src.valuesHStack - self.valuesVStack = src.valuesVStack - - def __len__(self): return len(self.src) - - def fieldNames(self): - return self.fieldnames - - def __iter__(self): - class FieldsSubsetIterator(object): - def __init__(self,ds): - self.ds=ds - self.src_iter=ds.src.__iter__() - self.example=None - def __iter__(self): return self - def next(self): - complete_example = self.src_iter.next() - if self.example: - self.example._values=[complete_example[field] - for field in self.ds.fieldnames] - else: - self.example=Example(self.ds.fieldnames, - [complete_example[field] for field in self.ds.fieldnames]) - return self.example - return FieldsSubsetIterator(self) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - assert self.hasFields(*fieldnames) - return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset) - def dontuse__getitem__(self,i): - return FieldsSubsetDataSet(self.src[i],self.fieldnames) - -class RenamedFieldsDataSet(DataSet): - """ - A sub-class of L{DataSet} that selects and renames a subset of the fields. - """ - def __init__(self,src,src_fieldnames,new_fieldnames): - self.src=src - self.src_fieldnames=src_fieldnames - self.new_fieldnames=new_fieldnames - assert src.hasFields(*src_fieldnames) - assert len(src_fieldnames)==len(new_fieldnames) - self.valuesHStack = src.valuesHStack - self.valuesVStack = src.valuesVStack - self.lookup_fields = Example(new_fieldnames,src_fieldnames) - - def __len__(self): return len(self.src) - - def fieldNames(self): - return self.new_fieldnames - - def __iter__(self): - class FieldsSubsetIterator(object): - def __init__(self,ds): - self.ds=ds - self.src_iter=ds.src.__iter__() - self.example=None - def __iter__(self): return self - def next(self): - complete_example = self.src_iter.next() - if self.example: - self.example._values=[complete_example[field] - for field in self.ds.src_fieldnames] - else: - self.example=Example(self.ds.new_fieldnames, - [complete_example[field] - for field in self.ds.src_fieldnames]) - return self.example - return FieldsSubsetIterator(self) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - assert self.hasFields(*fieldnames) - cursor = Example(fieldnames,[0]*len(fieldnames)) - for batch in self.src.minibatches_nowrap([self.lookup_fields[f] for f in fieldnames],minibatch_size,n_batches,offset): - cursor._values=batch._values - yield cursor - - def __getitem__(self,i): -# return FieldsSubsetDataSet(self.src[i],self.new_fieldnames) - complete_example = self.src[i] - return Example(self.new_fieldnames, - [complete_example[field] - for field in self.src_fieldnames]) - - - -class DataSetFields(Example): - """ - Although a L{DataSet} iterates over examples (like rows of a matrix), an associated - DataSetFields iterates over fields (like columns of a matrix), and can be understood - as a transpose of the associated dataset. - - To iterate over fields, one can do - * for fields in dataset.fields() - * for fields in dataset(field1,field2,...).fields() to select a subset of fields - * for fields in dataset.fields(field1,field2,...) to select a subset of fields - and each of these fields is iterable over the examples: - * for field_examples in dataset.fields(): - for example_value in field_examples: - ... 
- but when the dataset is a stream (unbounded length), it is not recommended to do - such things because the underlying dataset may refuse to access the different fields in - an unsynchronized ways. Hence the fields() method is illegal for streams, by default. - The result of fields() is a DataSetFields object, which iterates over fields, - and whose elements are iterable over examples. A DataSetFields object can - be turned back into a DataSet with its examples() method: - dataset2 = dataset1.fields().examples() - and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1). - - DataSetFields can be concatenated vertically or horizontally. To be consistent with - the syntax used for DataSets, the | concatenates the fields and the & concatenates - the examples. - """ - def __init__(self,dataset,fieldnames): - original_dataset=dataset - if not fieldnames: - fieldnames=dataset.fieldNames() - elif not list(fieldnames)==list(dataset.fieldNames()): - #we must cast to list, othersize('x','y')!=['x','y'] - dataset = FieldsSubsetDataSet(dataset,fieldnames) - assert dataset.hasFields(*fieldnames) - self.dataset=dataset - - if isinstance(dataset,MinibatchDataSet): - Example.__init__(self,fieldnames,list(dataset._fields)) - elif isinstance(original_dataset,MinibatchDataSet): - Example.__init__(self,fieldnames, - [original_dataset._fields[field] - for field in fieldnames]) - else: - minibatch_iterator = dataset.minibatches(fieldnames, - minibatch_size=len(dataset), - n_batches=1) - minibatch=minibatch_iterator.next() - Example.__init__(self,fieldnames,minibatch) - - def examples(self): - return self.dataset - - def __or__(self,other): - """ - fields1 | fields2 is a DataSetFields that whose list of examples is the concatenation - of the list of examples of DataSetFields fields1 and fields2. - """ - return (self.examples() + other.examples()).fields() - - def __and__(self,other): - """ - fields1 + fields2 is a DataSetFields that whose list of fields is the concatenation - of the fields of DataSetFields fields1 and fields2. - """ - return (self.examples() | other.examples()).fields() - - -class MinibatchDataSet(DataSet): - """ - Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset. - Each element of the lookup-list should be an iterable and sliceable, all of the same length. - """ - def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, - values_hstack=DataSet().valuesHStack): - """ - The user can (and generally should) also provide values_vstack(fieldname,fieldvalues) - and a values_hstack(fieldnames,fieldvalues) functions behaving with the same - semantics as the DataSet methods of the same name (but without the self argument). 
- """ - - self._fields=fields_lookuplist - assert len(fields_lookuplist)>0 - self.length=len(fields_lookuplist[0]) - for field in fields_lookuplist[1:]: - if self.length != len(field) : - print 'self.length = ',self.length - print 'len(field) = ', len(field) - print 'self._fields.keys() = ', self._fields.keys() - print 'field=',field - print 'fields_lookuplist=', fields_lookuplist - assert self.length==len(field) - self.valuesVStack=values_vstack - self.valuesHStack=values_hstack - - def __len__(self): - return self.length - - def dontuse__getitem__(self,i): - if type(i) in (slice,list): - return DataSetFields(MinibatchDataSet( - Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames()) - if type(i) is int: - return Example(self._fields.keys(),[field[i] for field in self._fields]) - if self.hasFields(i): - return self._fields[i] - assert i in self.__dict__ # else it means we are trying to access a non-existing property - return self.__dict__[i] - - def fieldNames(self): - return self._fields.keys() - - def hasFields(self,*fieldnames): - for fieldname in fieldnames: - if fieldname not in self._fields.keys(): - return False - return True - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - #@TODO bug somewhere here, fieldnames doesnt seem to be well handled - class Iterator(object): - def __init__(self,ds,fieldnames): - # tbm: added two next lines to handle fieldnames - if fieldnames is None: fieldnames = ds._fields.keys() - self.fieldnames = fieldnames - - self.ds=ds - self.next_example=offset - assert minibatch_size >= 0 - if offset+minibatch_size > ds.length: - raise NotImplementedError() - def __iter__(self): - return self - def next(self): - upper = self.next_example+minibatch_size - if upper > len(self.ds) : - raise StopIteration() - assert upper<=len(self.ds) # instead of self.ds.length - #minibatch = Example(self.ds._fields.keys(), - # [field[self.next_example:upper] - # for field in self.ds._fields]) - # tbm: modif to use fieldnames - values = [] - for f in self.fieldnames : - #print 'we have field',f,'in fieldnames' - values.append( self.ds._fields[f][self.next_example:upper] ) - minibatch = Example(self.fieldnames,values) - #print minibatch - self.next_example+=minibatch_size - return minibatch - - # tbm: added fieldnames to handle subset of fieldnames - return Iterator(self,fieldnames) - -class HStackedDataSet(DataSet): - """ - A L{DataSet} that wraps several datasets and shows a view that includes all their fields, - i.e. whose list of fields is the concatenation of their lists of fields. - - If a field name is found in more than one of the datasets, then either an error is - raised or the fields are renamed (either by prefixing the __name__ attribute - of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). - - @todo: automatically detect a chain of stacked datasets due to A | B | C | D ... - """ - def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): - DataSet.__init__(self,description,field_types) - self.datasets=datasets - self.accept_nonunique_names=accept_nonunique_names - self.fieldname2dataset={} - - def rename_field(fieldname,dataset,i): - if hasattr(dataset,"__name__"): - return dataset.__name__ + "." 
+ fieldname - return fieldname+"."+str(i) - - # make sure all datasets have the same length and unique field names - self.length=None - names_to_change=[] - for i in xrange(len(datasets)): - dataset = datasets[i] - length=len(dataset) - if self.length: - assert self.length==length - else: - self.length=length - for fieldname in dataset.fieldNames(): - if fieldname in self.fieldname2dataset: # name conflict! - if accept_nonunique_names: - fieldname=rename_field(fieldname,dataset,i) - names2change.append((fieldname,i)) - else: - raise ValueError("Incompatible datasets: non-unique field name = "+fieldname) - self.fieldname2dataset[fieldname]=i - for fieldname,i in names_to_change: - del self.fieldname2dataset[fieldname] - self.fieldname2dataset[rename_field(fieldname,self.datasets[i],i)]=i - - def __len__(self): - return len(self.datasets[0]) - - def hasFields(self,*fieldnames): - for fieldname in fieldnames: - if not fieldname in self.fieldname2dataset: - return False - return True - - def fieldNames(self): - return self.fieldname2dataset.keys() - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - - class HStackedIterator(object): - def __init__(self,hsds,iterators): - self.hsds=hsds - self.iterators=iterators - def __iter__(self): - return self - def next(self): - # concatenate all the fields of the minibatches - l=Example() - for iter in self.iterators: - l.append_lookuplist(iter.next()) - return l - - assert self.hasFields(*fieldnames) - # find out which underlying datasets are necessary to service the required fields - # and construct corresponding minibatch iterators - if fieldnames and fieldnames!=self.fieldNames(): - datasets=set([]) - fields_in_dataset=dict([(dataset,[]) for dataset in datasets]) - for fieldname in fieldnames: - dataset=self.datasets[self.fieldname2dataset[fieldname]] - datasets.add(dataset) - fields_in_dataset[dataset].append(fieldname) - datasets=list(datasets) - iterators=[dataset.minibatches(fields_in_dataset[dataset],minibatch_size,n_batches,offset) - for dataset in datasets] - else: - datasets=self.datasets - iterators=[dataset.minibatches(None,minibatch_size,n_batches,offset) for dataset in datasets] - return HStackedIterator(self,iterators) - - - def untested_valuesVStack(self,fieldname,fieldvalues): - return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues) - - def untested_valuesHStack(self,fieldnames,fieldvalues): - """ - We will use the sub-dataset associated with the first fieldname in the fieldnames list - to do the work, hoping that it can cope with the other values (i.e. won't care - about the incompatible fieldnames). Hence this heuristic will always work if - all the fieldnames are of the same sub-dataset. - """ - return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) - -class VStackedDataSet(DataSet): - """ - A L{DataSet} that wraps several datasets and shows a view that includes all their examples, - in the order provided. This clearly assumes that they all have the same field names - and all (except possibly the last one) are of finite length. - - @todo: automatically detect a chain of stacked datasets due to A + B + C + D ... 
- """ - def __init__(self,datasets): - self.datasets=datasets - self.length=0 - self.index2dataset={} - assert len(datasets)>0 - fieldnames = datasets[-1].fieldNames() - self.datasets_start_row=[] - # We use this map from row index to dataset index for constant-time random access of examples, - # to avoid having to search for the appropriate dataset each time and slice is asked for. - for dataset,k in enumerate(datasets[0:-1]): - assert dataset.is_unbounded() # All VStacked datasets (except possibly the last) must be bounded (have a length). - L=len(dataset) - for i in xrange(L): - self.index2dataset[self.length+i]=k - self.datasets_start_row.append(self.length) - self.length+=L - assert dataset.fieldNames()==fieldnames - self.datasets_start_row.append(self.length) - self.length+=len(datasets[-1]) - # If length is very large, we should use a more memory-efficient mechanism - # that does not store all indices - if self.length>1000000: - # 1 million entries would require about 60 meg for the index2dataset map - # TODO - print "A more efficient mechanism for index2dataset should be implemented" - - def __len__(self): - return self.length - - def fieldNames(self): - return self.datasets[0].fieldNames() - - def hasFields(self,*fieldnames): - return self.datasets[0].hasFields(*fieldnames) - - def locate_row(self,row): - """Return (dataset_index, row_within_dataset) for global row number""" - dataset_index = self.index2dataset[row] - row_within_dataset = self.datasets_start_row[dataset_index] - return dataset_index, row_within_dataset - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - - class VStackedIterator(object): - def __init__(self,vsds): - self.vsds=vsds - self.next_row=offset - self.next_dataset_index,self.next_dataset_row=self.vsds.locate_row(offset) - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[0],offset,n_batches) - - def next_iterator(self,dataset,starting_offset,batches_left): - L=len(dataset) - ds_nbatches = (L-starting_offset)/minibatch_size - if batches_left is not None: - ds_nbatches = max(batches_left,ds_nbatches) - if minibatch_size>L: - ds_minibatch_size=L - n_left_in_mb=minibatch_size-L - ds_nbatches=1 - else: - n_left_in_mb=0 - return dataset.minibatches(fieldnames,minibatch_size,ds_nbatches,starting_offset), \ - L-(starting_offset+ds_nbatches*minibatch_size), n_left_in_mb - - def move_to_next_dataset(self): - if self.n_left_at_the_end_of_ds>0: - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[self.next_dataset_index], - self.n_left_at_the_end_of_ds,1) - else: - self.next_dataset_index +=1 - if self.next_dataset_index==len(self.vsds.datasets): - self.next_dataset_index = 0 - self.current_iterator,self.n_left_at_the_end_of_ds,self.n_left_in_mb= \ - self.next_iterator(vsds.datasets[self.next_dataset_index],starting_offset,n_batches) - - def __iter__(self): - return self - - def next(self): - dataset=self.vsds.datasets[self.next_dataset_index] - mb = self.next_iterator.next() - if self.n_left_in_mb: - extra_mb = [] - while self.n_left_in_mb>0: - self.move_to_next_dataset() - extra_mb.append(self.next_iterator.next()) - mb = Example(fieldnames, - [dataset.valuesVStack(name, - [mb[name]]+[b[name] for b in extra_mb]) - for name in fieldnames]) - - self.next_row+=minibatch_size - self.next_dataset_row+=minibatch_size - if self.next_row+minibatch_size>len(dataset): - self.move_to_next_dataset() - return examples - return 
VStackedIterator(self) - -class ArrayFieldsDataSet(DataSet): - """ - Virtual super-class of datasets whose field values are numpy array, - thus defining valuesHStack and valuesVStack for sub-classes. - """ - def __init__(self,description=None,field_types=None): - DataSet.__init__(self,description,field_types) - def untested_valuesHStack(self,fieldnames,fieldvalues): - """Concatenate field values horizontally, e.g. two vectors - become a longer vector, two matrices become a wider matrix, etc.""" - return numpy.hstack(fieldvalues) - def untested_valuesVStack(self,fieldname,values): - """Concatenate field values vertically, e.g. two vectors - become a two-row matrix, two matrices become a longer matrix, etc.""" - return numpy.vstack(values) - - - -class NArraysDataSet(ArrayFieldsDataSet) : - """ - An NArraysDataSet stores fields that are numpy tensor, whose first axis - iterates over examples. It's a generalization of ArrayDataSet. - """ - #@TODO not completely implemented yet - def __init__(self, data_arrays, fieldnames, **kwargs) : - """ - Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list - of fieldnames. The number of arrays must be the same as the number of - fieldnames. Each set of numpy tensor must have the same first dimension (first - axis) corresponding to the number of examples. - - Every tensor is treated as a numpy array (using numpy.asarray) - """ - ArrayFieldsDataSet.__init__(self,**kwargs) - assert len(data_arrays) == len(fieldnames) - assert len(fieldnames) > 0 - ndarrays = [numpy.asarray(a) for a in data_arrays] - lens = [a.shape[0] for a in ndarrays] - num_examples = lens[0] #they must all be equal anyway - self._fieldnames = fieldnames - for k in ndarrays : - assert k.shape[0] == num_examples - self._datas = ndarrays - # create dict - self.map_field_idx = dict() - for k in range(len(fieldnames)): - self.map_field_idx[fieldnames[k]] = k - - - def __len__(self) : - """ - Length of the dataset is based on the first array = data_arrays[0], using its shape - """ - return self._datas[0].shape[0] - - def fieldNames(self) : - """ - Returns the fieldnames as set in self.__init__ - """ - return self._fieldnames - - def field_pos(self,fieldname) : - """ - Returns the index of a given fieldname. Fieldname must exists! see fieldNames(). - """ - return self.map_field_idx[fieldname] - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - cursor = Example(fieldnames,[0]*len(fieldnames)) - fieldnames = self.fieldNames() if fieldnames is None else fieldnames - for n in xrange(n_batches): - if offset == len(self): - break - for f in range(len(cursor._names)) : - idx = self.field_pos(cursor._names[f]) - sub_data = self._datas[idx][offset : offset+minibatch_size] - cursor._values[f] = sub_data - offset += len(sub_data) #can be less than minibatch_size at end - yield cursor - - #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - - - -class ArrayDataSet(ArrayFieldsDataSet): - """ - An ArrayDataSet stores the fields as groups of columns in a numpy tensor, - whose first axis iterates over examples, second axis determines fields. - If the underlying array is N-dimensional (has N axes), then the field - values are (N-2)-dimensional objects (i.e. ordinary numbers if N=2). - """ - - def __init__(self, data_array, fields_columns, **kwargs): - """ - Construct an ArrayDataSet from the underlying numpy array (data) and - a map (fields_columns) from fieldnames to field columns. 
The columns of a field are specified - using the standard arguments for indexing/slicing: integer for a column index, - slice for an interval of columns (with possible stride), or iterable of column indices. - """ - ArrayFieldsDataSet.__init__(self, **kwargs) - self.data=data_array - self.fields_columns=fields_columns - - # check consistency and complete slices definitions - for fieldname, fieldcolumns in self.fields_columns.items(): - if type(fieldcolumns) is int: - assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1] - if 1: - #I changed this because it didn't make sense to me, - # and it made it more difficult to write my learner. - # If it breaks stuff, let's talk about it. - # - James 22/05/2008 - self.fields_columns[fieldname]=[fieldcolumns] - else: - self.fields_columns[fieldname]=fieldcolumns - elif type(fieldcolumns) is slice: - start,step=fieldcolumns.start,fieldcolumns.step - if not start: - start=0 - if not step: - step=1 - self.fields_columns[fieldname]=slice(start,fieldcolumns.stop,step) - elif hasattr(fieldcolumns,"__iter__"): # something like a list - for i in fieldcolumns: - assert i>=0 and i<data_array.shape[1] - - def fieldNames(self): - return self.fields_columns.keys() - - def __len__(self): - return len(self.data) - - def __getitem__(self,key): - """More efficient implementation than the default __getitem__""" - fieldnames=self.fields_columns.keys() - values=self.fields_columns.values() - if type(key) is int: - return Example(fieldnames, - [self.data[key,col] for col in values]) - if type(key) is slice: - return Example(fieldnames,[self.data[key,col] for col in values]) - if type(key) is list: - for i in range(len(key)): - if self.hasFields(key[i]): - key[i]=self.fields_columns[key[i]] - return Example(fieldnames, - #we must separate differently for list as numpy - # doesn't support self.data[[i1,...],[i2,...]] - # when their is more then two i1 and i2 - [self.data[key,:][:,col] - if isinstance(col,list) else - self.data[key,col] for col in values]) - - # else check for a fieldname - if self.hasFields(key): - return self.data[:,self.fields_columns[key]] - # else we are trying to access a property of the dataset - assert key in self.__dict__ # else it means we are trying to access a non-existing property - return self.__dict__[key] - - def dontuse__iter__(self): - class ArrayDataSetIteratorIter(object): - def __init__(self,dataset,fieldnames): - if fieldnames is None: fieldnames = dataset.fieldNames() - # store the resulting minibatch in a lookup-list of values - self.minibatch = Example(fieldnames,[0]*len(fieldnames)) - self.dataset=dataset - self.current=0 - self.columns = [self.dataset.fields_columns[f] - for f in self.minibatch._names] - self.l = self.dataset.data.shape[0] - def __iter__(self): - return self - def next(self): - #@todo: we suppose that we need to stop only when minibatch_size == 1. - # Otherwise, MinibatchWrapAroundIterator do it. 
- if self.current>=self.l: - raise StopIteration - sub_data = self.dataset.data[self.current] - self.minibatch._values = [sub_data[c] for c in self.columns] - - self.current+=1 - return self.minibatch - - return ArrayDataSetIteratorIter(self,self.fieldNames()) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - cursor = Example(fieldnames,[0]*len(fieldnames)) - fieldnames = self.fieldNames() if fieldnames is None else fieldnames - if n_batches == None: - n_batches = (len(self) - offset) / minibatch_size - for n in xrange(n_batches): - if offset == len(self): - break - sub_data = self.data[offset : offset+minibatch_size] - offset += len(sub_data) #can be less than minibatch_size at end - cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names] - yield cursor - - #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) - - -class CachedDataSet(DataSet): - """ - Wrap a L{DataSet} whose values are computationally expensive to obtain - (e.g. because they involve some computation, or disk access), - so that repeated accesses to the same example are done cheaply, - by caching every example value that has been accessed at least once. - - Optionally, for finite-length dataset, all the values can be computed - (and cached) upon construction of the CachedDataSet, rather at the - first access. - - @todo: when cache_all_upon_construction create mini-batches that are as - large as possible but not so large as to fill up memory. - - @todo: add disk-buffering capability, so that when the cache becomes too - big for memory, we cache things on disk, trying to keep in memory only - the record most likely to be accessed next. - """ - def __init__(self,source_dataset,cache_all_upon_construction=False): - self.source_dataset=source_dataset - self.cache_all_upon_construction=cache_all_upon_construction - self.cached_examples = [] - if cache_all_upon_construction: - # this potentially brings all the source examples - # into memory at once, which may be too much - # the work could possibly be done by minibatches - # that are as large as possible but no more than what memory allows. 
- # - # field_values is supposed to be an DataSetFields, that inherits from LookupList - #fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next() - fields_values = DataSetFields(source_dataset,None) - assert all([len(self)==len(field_values) for field_values in fields_values]) - for example in fields_values.examples(): - self.cached_examples.append(copy.copy(example)) - - self.fieldNames = source_dataset.fieldNames - self.hasFields = source_dataset.hasFields - self.valuesHStack = source_dataset.valuesHStack - self.valuesVStack = source_dataset.valuesVStack - - def __len__(self): - return len(self.source_dataset) - - def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): - class CacheIterator(object): - def __init__(self,dataset): - self.dataset=dataset - self.current=offset - self.all_fields = self.dataset.fieldNames()==fieldnames - self.n_batches = n_batches - self.batch_counter = 0 - def __iter__(self): return self - def next(self): - self.batch_counter += 1 - if self.n_batches and self.batch_counter > self.n_batches : - raise StopIteration() - upper = self.current+minibatch_size - if upper > len(self.dataset.source_dataset): - raise StopIteration() - cache_len = len(self.dataset.cached_examples) - if upper>cache_len: # whole minibatch is not already in cache - # cache everything from current length to upper - #for example in self.dataset.source_dataset[cache_len:upper]: - for example in self.dataset.source_dataset.subset[cache_len:upper]: - self.dataset.cached_examples.append(example) - all_fields_minibatch = Example(self.dataset.fieldNames(), - zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size])) - - self.current+=minibatch_size - if self.all_fields: - return all_fields_minibatch - return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames]) - return CacheIterator(self) - - def dontuse__getitem__(self,i): - if type(i)==int and len(self.cached_examples)>i: - return self.cached_examples[i] - else: - return self.source_dataset[i] - - def __iter__(self): - class CacheIteratorIter(object): - def __init__(self,dataset): - self.dataset=dataset - self.l = len(dataset) - self.current = 0 - self.fieldnames = self.dataset.fieldNames() - self.example = Example(self.fieldnames,[0]*len(self.fieldnames)) - def __iter__(self): return self - def next(self): - if self.current>=self.l: - raise StopIteration - cache_len = len(self.dataset.cached_examples) - if self.current>=cache_len: # whole minibatch is not already in cache - # cache everything from current length to upper - self.dataset.cached_examples.append( - self.dataset.source_dataset[self.current]) - self.example._values = self.dataset.cached_examples[self.current] - self.current+=1 - return self.example - - return CacheIteratorIter(self) - -class ApplyFunctionDataSet(DataSet): - """ - A L{DataSet} that contains as fields the results of applying a - given function example-wise or minibatch-wise to all the fields of - an input dataset. The output of the function should be an iterable - (e.g. a list or a LookupList) over the resulting values. - - The function take as input the fields of the dataset, not the examples. - - In minibatch mode, the function is expected to work on minibatches - (takes a minibatch in input and returns a minibatch in output). More - precisely, it means that each element of the input or output list - should be iterable and indexable over the individual example values - (typically these elements will be numpy arrays). 
All of the elements - in the input and output lists should have the same length, which is - the length of the minibatch. - - The function is applied each time an example or a minibatch is accessed. - To avoid re-doing computation, wrap this dataset inside a CachedDataSet. - - If the values_{h,v}stack functions are not provided, then - the input_dataset.values{H,V}Stack functions are used by default. - - """ - - def __init__(self,input_dataset,function,output_names,minibatch_mode=True, - values_hstack=None,values_vstack=None, - description=None,fieldtypes=None): - """ - Constructor takes an input dataset that has as many fields as the function - expects as inputs. The resulting dataset has as many fields as the function - produces as outputs, and that should correspond to the number of output names - (provided in a list). - - Note that the expected semantics of the function differs in minibatch mode - (it takes minibatches of inputs and produces minibatches of outputs, as - documented in the class comment). - - TBM: are fieldtypes the old field types (from input_dataset) or the new ones - (for the new dataset created)? - """ - self.input_dataset=input_dataset - self.function=function - self.output_names=output_names - #print 'self.output_names in afds:', self.output_names - #print 'length in afds:', len(self.output_names) - self.minibatch_mode=minibatch_mode - DataSet.__init__(self,description,fieldtypes) - self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack - self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack - - def __len__(self): - return len(self.input_dataset) - - def fieldNames(self): - return self.output_names - - def minibatches_nowrap(self, fieldnames, *args, **kwargs): - all_input_fieldNames = self.input_dataset.fieldNames() - mbnw = self.input_dataset.minibatches_nowrap - - for input_fields in mbnw(all_input_fieldNames, *args, **kwargs): - if self.minibatch_mode: - all_output_fields = self.function(*input_fields) - else: - input_examples = zip(*input_fields) #makes so that [i] means example i - output_examples = [self.function(*input_example) - for input_example in input_examples] - all_output_fields = zip(*output_examples) - - #print 'output_names=', self.output_names - #print 'all_output_fields', all_output_fields - #print 'len(all_output_fields)=', len(all_output_fields) - all_outputs = Example(self.output_names, all_output_fields) - if fieldnames==self.output_names: - rval = all_outputs - else: - rval = Example(fieldnames,[all_outputs[name] for name in fieldnames]) - #print 'rval', rval - #print '--------' - yield rval - - def untested__iter__(self): # only implemented for increased efficiency - class ApplyFunctionSingleExampleIterator(object): - def __init__(self,output_dataset): - self.current=0 - self.output_dataset=output_dataset - self.input_iterator=output_dataset.input_dataset.__iter__() - def __iter__(self): return self - def next(self): - if self.output_dataset.minibatch_mode: - function_inputs = [[input] for input in self.input_iterator.next()] - outputs = self.output_dataset.function(*function_inputs) - assert all([hasattr(output,'__iter__') for output in outputs]) - function_outputs = [output[0] for output in outputs] - else: - function_inputs = self.input_iterator.next() - function_outputs = self.output_dataset.function(*function_inputs) - return Example(self.output_dataset.output_names,function_outputs) - return ApplyFunctionSingleExampleIterator(self) - -def 
supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): - """ - Wraps an arbitrary L{DataSet} into one for supervised learning tasks - by forcing the user to define a set of fields as the 'input' field - and a set of fields as the 'target' field. Optionally, a single - weight_field can also be defined. - """ - args = ((input_fields,'input'),(output_fields,'target')) - if weight_field: args+=(([weight_field],'weight')) - return src_dataset.merge_fields(*args) - - - -
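The removed VStackedDataSet above keeps a per-row index2dataset map and its own comment flags that map as too memory-hungry for large datasets. A minimal sketch of the constant-memory alternative, assuming only the cumulative start offsets in datasets_start_row are kept (the helper below is hypothetical, not part of this changeset):

.. code-block:: python

    import bisect

    def locate_row(datasets_start_row, row):
        """Map a global row index to (dataset_index, row_within_dataset)
        by binary search over the cumulative start offsets."""
        dataset_index = bisect.bisect_right(datasets_start_row, row) - 1
        return dataset_index, row - datasets_start_row[dataset_index]

    # Three stacked datasets of lengths 5, 3 and 4.
    starts = [0, 5, 8]
    assert locate_row(starts, 0) == (0, 0)
    assert locate_row(starts, 6) == (1, 1)
    assert locate_row(starts, 11) == (2, 3)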
--- a/pylearn/old_dataset/learner.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ - - -from exceptions import * -from dataset import AttributesHolder - -class OfflineLearningAlgorithm(object): - """ - Base class for offline learning algorithms, provides an interface - that allows various algorithms to be applicable to generic learning - algorithms. It is only given here to define the expected semantics. - - An offline learning algorithm can be seen as a function that when - applied to training data returns a learned function (which is an object that - can be applied to other data and return some output data). - - The offline learning scenario is the standard and most common one - in machine learning: an offline learning algorithm is applied - to a training dataset, - - model = learning_algorithm(training_set) - - resulting in a fully trained model that can be applied to another dataset - in order to perform some desired computation: - - output_dataset = model(input_dataset) - - Note that the application of a dataset has no side-effect on the model. - In that example, the training set may for example have 'input' and 'target' - fields while the input dataset may have only 'input' (or both 'input' and - 'target') and the output dataset would contain some default output fields defined - by the learning algorithm (e.g. 'output' and 'error'). The user may specifiy - what the output dataset should contain either by setting options in the - model, by the presence of particular fields in the input dataset, or with - keyword options of the __call__ method of the model (see LearnedModel.__call__). - - """ - - def __init__(self): pass - - def __call__(self, training_dataset): - """ - Return a fully trained TrainedModel. - """ - raise AbstractFunction() - -class TrainedModel(AttributesHolder): - """ - TrainedModel is a base class for models returned by instances of an - OfflineLearningAlgorithm subclass. It is only given here to define the expected semantics. - """ - def __init__(self): - pass - - def __call__(self,input_dataset,output_fieldnames=None, - test_stats_collector=None,copy_inputs=False, - put_stats_in_output_dataset=True, - output_attributes=[]): - """ - A L{TrainedModel} can be used with - with one or more calls to it. The main argument is an input L{DataSet} (possibly - containing a single example) and the result is an output L{DataSet} of the same length. - If output_fieldnames is specified, it may be use to indicate which fields should - be constructed in the output L{DataSet} (for example ['output','classification_error']). - Otherwise, some default output fields are produced (possibly depending on the input - fields available in the input_dataset). - Optionally, if copy_inputs, the input fields (of the input_dataset) can be made - visible in the output L{DataSet} returned by this method. - Optionally, attributes of the learner can be copied in the output dataset, - and statistics computed by the stats collector also put in the output dataset. - Note the distinction between fields (which are example-wise quantities, e.g. 'input') - and attributes (which are not, e.g. 'regularization_term'). - """ - raise AbstractFunction() - - -class OnlineLearningAlgorithm(object): - """ - Base class for online learning algorithms, provides an interface - that allows various algorithms to be applicable to generic online learning - algorithms. It is only given here to define the expected semantics. 
- - The basic setting is that the training data are only revealed in pieces - (maybe one example or a batch of example at a time): - - model = learning_algorithm() - - results in a fresh model. The model can be adapted by presenting - it with some training data, - - model.update(some_training_data) - ... - model.update(some_more_training_data) - ... - model.update(yet_more_training_data) - - and at any point one can use the model to perform some computation: - - output_dataset = model(input_dataset) - - The model should be a LearnerModel subclass instance, and LearnerModel - is a subclass of LearnedModel. - - """ - - def __init__(self): pass - - def __call__(self, training_dataset=None): - """ - Return a LearnerModel, either fresh (if training_dataset is None) or fully trained (otherwise). - """ - raise AbstractFunction() - -class LearnerModel(TrainedModel): - """ - LearnerModel is a base class for models returned by instances of a LearningAlgorithm subclass. - It is only given here to define the expected semantics. - """ - def __init__(self): - pass - - def update(self,training_set,train_stats_collector=None): - """ - Continue training a learner model, with the evidence provided by the given training set. - Hence update can be called multiple times. This is the main method used for training in the - on-line setting or the sequential (Bayesian or not) settings. - - This function has as side effect that self(data) will behave differently, - according to the adaptation achieved by update(). - - The user may optionally provide a training L{StatsCollector} that is used to record - some statistics of the outputs computed during training. It is update(d) during - training. - """ - raise AbstractFunction() -
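The offline protocol described above reduces to "training returns a callable model; applying the model has no side effects". A toy illustration of that calling convention, with plain lists standing in for DataSets (all names below are hypothetical):

.. code-block:: python

    import numpy

    class MeanPredictorAlgorithm(object):
        """Offline algorithm: calling it on training data returns a trained model."""
        def __call__(self, training_values):
            return MeanPredictorModel(float(numpy.mean(training_values)))

    class MeanPredictorModel(object):
        """Trained model: applying it to new data does not modify the model."""
        def __init__(self, mean):
            self.mean = mean
        def __call__(self, input_values):
            return [self.mean for _ in input_values]

    model = MeanPredictorAlgorithm()([1.0, 2.0, 3.0])
    assert model([10, 20]) == [2.0, 2.0]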
--- a/pylearn/old_dataset/lookup_list.py Wed Oct 14 10:19:37 2009 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ - -from copy import deepcopy - -class LookupList(object): - """ - A LookupList is a sequence whose elements can be named (and unlike - a dictionary the order of the elements depends not on their key but - on the order given by the user through construction) so that - following syntactic constructions work as one would expect:: - >>> example = LookupList(['x','y','z'],[1,2,3]) - >>> example['x'] = [1, 2, 3] # set or change a field - >>> print example('z','y') # prints [3,2] - >>> x, y, z = example - >>> x = example[0] - >>> x = example["x"] - >>> print example.keys() # prints ['x','y','z'] - >>> print example.values() # prints [[1,2,3],2,3] - >>> print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)] - >>> example.append_keyval('u',0) # adds item with name 'u' and value 0 - >>> print len(example) # number of items = 4 here - >>> example2 = LookupList(['v', 'w'], ['a','b']) - >>> print example+example2 # addition is like for lists, a concatenation of the items. - >>> example + example # throw an error as we can't have duplicate name. - - @note: The element names should be unique. - - @todo: Convert this documentation into doctest - format, and actually perform doctest'ing: - U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks} - """ - def __init__(self,names=[],values=[]): - #print 'values=', values - #print 'length=', len(values) - #print 'names=', names - #print 'length=',len(names) - assert len(values)==len(names) - self.__dict__['_values']=values - self.__dict__['_name2index']={} - self.__dict__['_names']=names - for i in xrange(len(values)): - assert names[i] not in self._name2index - self._name2index[names[i]]=i - - def keys(self): - return self._names - - def values(self): - return self._values - - def items(self): - """ - Return a list of (name,value) pairs of all the items in the look-up list. - """ - return zip(self._names,self._values) - - def __getitem__(self,key): - """ - The key in example[key] can either be an integer to index the fields - or the name of the field. 
- """ - if isinstance(key,int) or isinstance(key,slice) or (isinstance(key,list) and all([isinstance(i,int) for i in key])): - return self._values[key] - else: # if not an int, key must be a name - # expecting key to be a valid field name - assert isinstance(key,str) - return self._values[self._name2index[key]] - - def __setitem__(self,key,value): - if isinstance(key,int): - self._values[key]=value - else: # if not an int, key must be a name - if key in self._name2index: - self._values[self._name2index[key]]=value - else: - self.append_keyval(key,value) - - def append_keyval(self, key, value): - assert key not in self._name2index - self._name2index[key]=len(self) - self._values.append(value) - self._names.append(key) - - def append_lookuplist(self, *list): - for l in list: - for key in l.keys(): - self.append_keyval(key,l[key]) - del l - - def __len__(self): - return len(self._values) - - def __repr__(self): - return "{%s}" % ", ".join([str(k) + "=" + repr(v) for k,v in self.items()]) - - def __add__(self,rhs): - new_example = deepcopy(self) - for item in rhs.items(): - new_example.append_keyval(item[0],item[1]) - return new_example - - def __radd__(self,lhs): - new_example = deepcopy(lhs) - for item in self.items(): - new_example.append_keyval(item[0],item[1]) - return new_example - - def __eq__(self, other): - return self._values==other._values and self._name2index==other._name2index and self._names==other._names - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - raise NotImplementedError() - - def __call__(self,*names): - """ - Return a list of values associated with the given names (which must all be keys of the lookup list). - """ - if names == self._names: - return self._values - return [self[name] for name in names] - - -if __name__ == '__main__': - - a=LookupList(['a'],[1]) - print a - b=LookupList(['b'],[2]) - print b - a.append_lookuplist(b) - print a - a.append_lookuplist(b) - print a
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/README.txt Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,2 @@ +The shared folder is for code taking advantage of Theano's shared-variable feature. +
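For orientation, a minimal sketch of the shared-variable pattern the modules below rely on, written with the same sandbox imports this changeset uses (whether those import paths still exist depends on your Theano version):

.. code-block:: python

    import numpy
    from theano import tensor
    from theano.compile.sandbox import shared, pfunc

    state = shared(numpy.asarray(0.0, dtype='float64'))   # value persists across calls
    inc = tensor.dscalar()
    new_state = state + inc
    accumulate = pfunc([inc], new_state, updates=[(state, new_state)])

    accumulate(1.0)
    accumulate(2.5)
    assert abs(state.value - 3.5) < 1e-12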
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/README.txt Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,29 @@ + +Layers are the building blocks of neural networks. +Often they are parametric, but not necessarily. + +This directory is meant to be a library of layers and, where applicable, the +algorithms meant to fit them to data. + + +.. code-block:: python + + class Layer(object): + + """ Base class for Layer, documenting interface conventions + + WRITEME + """ + + input = None + + output = None + + l1 = 0 + + l2_sqr = 0 + + params = [] + + updates = [] +
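A small sketch (not part of the changeset) of why this convention is useful: code that stacks layers can collect costs, parameters and updates generically from any objects that follow it.

.. code-block:: python

    def stack_cost_and_params(layers, base_cost, l1_penalty=0.0, l2_penalty=0.0):
        """Combine the interface attributes of a list of layer-like objects."""
        cost, params, updates = base_cost, [], []
        for layer in layers:
            cost = cost + l1_penalty * layer.l1 + l2_penalty * layer.l2_sqr
            params.extend(layer.params)
            updates.extend(layer.updates)
        return cost, params, updates

    class _DummyLayer(object):
        l1 = 0.0; l2_sqr = 0.0; params = (); updates = ()

    cost, params, updates = stack_cost_and_params([_DummyLayer(), _DummyLayer()], 1.0)
    assert (cost, params, updates) == (1.0, [], [])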
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/__init__.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,23 @@ +# logreg.py +from .logreg import LogisticRegression + +# sigmoidal_layer.py +from .sigmoidal_layer import SigmoidalLayer + +# exponential_mean.py +from .exponential_mean import ExponentialMean + +# sgd.py +from .sgd import StochasticGradientDescent, HalflifeStopper + +# kording2004.py +from .kording2004 import Kording2004 + +# rust2005.py +from .rust2005 import Rust2005 + +# lecun1998.py +from .lecun1998 import LeNetConvPool + +# kouh2008.py +from .kouh2008 import Kouh2008
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/exponential_mean.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,84 @@ + +"""Modules for maintaining statistics based on exponential decay""" +__docformat__ = "restructuredtext en" + +import copy +import numpy +import theano +import theano.tensor +from theano.compile.sandbox import shared + +class ExponentialMean(object): + """Maintain an exponentially-decaying estimate of the mean + + This module computes the exact mean of the first `max_denom` values of `x`. + After the first `max_denom` values, it tracks the mean using the formula: + + :math:`self.running <- (1.0 - (1.0/max_denom)) * self.running + (1.0/max_denom) * x` + + """ + + max_denom = None + """The average will be updated as if the current estimated average was estimated from at + most `max_denom-1` values.""" + + running = None + """Shared: The running mean statistic from which the output is computed.""" + + denom = None + """Shared: The number of examples we've updated from so far + """ + + def __init__(self, input, max_denom, ival): + """ + :param input: track the mean of this Variable + + :param max_denom: see `self.max_denom` + + :param ival: This should be a tensor of zeros with a shape that matches `input`'s runtime + value. + + """ + dtype=ival.dtype #dtype is an actual numpy dtype object, not a string + self.max_denom = max_denom + + if len(ival.shape) == 0: + input_type = theano.tensor.dscalar + elif len(ival.shape) == 1: + input_type = theano.tensor.dvector + elif len(ival.shape) == 2: + input_type = theano.tensor.dmatrix + else: + #TODO: x_type = theano.tensor.TensorType(...) + raise NotImplementedError() + + self.running = shared(numpy.array(ival, copy=True)) + # TODO: making this an lscalar caused different optimizations, followed by integer + # division somewhere were I wanted float division.... and the wrong answer. + self.denom = shared(numpy.asarray(1, dtype=dtype)) + + alpha = 1.0 / self.denom + self.output = (1.0 - alpha) * self.running + theano.tensor.cast(alpha * input, str(dtype)) + + self.updates = [ + (self.running, self.output), + (self.denom, theano.tensor.smallest(self.denom + 1, self.max_denom)), + ] + + assert self.output.type.dtype == dtype + + @classmethod + def new(cls, x, x_shape, max_denom, dtype='float64'): + """Return an `ExponentialMean` to track a Variable `x` with given shape + + :type x: Variable + :type x_shape: tuple + :type max_denom: int + :type dtype: string + :param dtype: the running average will be computed at this precision + + :rtype: ExponentialMean instance + """ + return cls(x, + max_denom=max_denom, + ival=numpy.zeros(x_shape, dtype=dtype))
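A NumPy-only sketch of the update rule ExponentialMean implements (illustrative, not the Theano graph above): the estimate is the exact mean for the first max_denom samples and an exponential moving average afterwards.

.. code-block:: python

    import numpy

    def exponential_mean_update(running, denom, x, max_denom):
        alpha = 1.0 / denom
        new_running = (1.0 - alpha) * running + alpha * x
        return new_running, min(denom + 1, max_denom)

    running, denom = numpy.zeros(3), 1
    for v in (1.0, 2.0, 3.0):
        running, denom = exponential_mean_update(running, denom, numpy.ones(3) * v, 500)
    # While denom has not yet saturated, the estimate equals the exact sample mean.
    assert numpy.allclose(running, 2.0)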
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/kording2004.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,127 @@ +import numpy +import theano.tensor +from hpu.theano_outgoing import mean, var, cov + +from .exponential_mean import ExponentialMean # exponential_mean.py + +import logging +_logger = logging.getLogger('kording2004') +def debug(*msg): _logger.debug(' '.join(str(m) for m in msg)) +def info(*msg): _logger.info(' '.join(str(m) for m in msg)) +def warn(*msg): _logger.warn(' '.join(str(m) for m in msg)) +def warning(*msg): _logger.warning(' '.join(str(m) for m in msg)) +def error(*msg): _logger.error(' '.join(str(m) for m in msg)) + +def cov_sum_of_squares(z, hint='tall', bias=0): + """Return the sum of the squares of all terms in the covariance of [normalized-and-centered] z + + :param hint: either 'tall' or 'fat' to indicate whether the computation should be carried + out on the gram matrix or in the covariance matrix. + + :note: This is computed using either the inner or outer-product depending on the `hint` + """ + denom = theano.tensor.cast(z.shape[0] if bias else (z.shape[0]-1), z.dtype) + if hint == 'fat': + return theano.tensor.sum(theano.tensor.dot(z, z.T)**2) / denom**2 + elif hint == 'tall': + return theano.tensor.sum(theano.tensor.dot(z.T, z)**2) / denom**2 + else: + raise ValueError(hint) + +def var_sum_of_squares(z, bias=0): + """Return the sum of squared variances in the columns of centered variable z + """ + denom = theano.tensor.cast((z.shape[0] if bias else (z.shape[0]-1)), z.dtype) + return theano.tensor.sum(theano.tensor.sum(z**2, axis=0)**2) / denom**2 + +def kording2004_normalized_decorrelation3(z, hint='fat'): + """Return the sum of the squares of the off-diagonal terms of an uncentered covariance + matrix + + :param z: a matrix of feature responses. Each row is the responses at one time-step. + + These features must have marginal mean 0 and variance 1 for this cost to make sense as a + training criterion. + + :note: This is computed using the gram matrix, not the covariance matrix + """ + assert z.ndim == 3 + zshape = z.shape + z2 = theano.tensor.reshape(z, [zshape[0]*zshape[1], zshape[2]]) + return cov_sum_of_squares(z2, hint=hint) - var_sum_of_squares(z2) + +def kording2004_normalized_slowness3(z, slowness_type='l2'): + """Return the average squared difference between each feature response and its previous + response. + + :param z: a 3-tensor of feature responses. Indexed [sequence][frame][feature] + + These features must have marginal mean 0 and variance 1 for this cost to make sense as a + training criterion. + """ + assert z.ndim == 3 + diff = (z[:,1:,:] - z[:,0:-1,:]) #the diff is taken over axis 1 + if slowness_type=='l2': + cost = diff**2 + elif slowness_type=='l1': + cost = abs(diff) + else: + raise ValueError(slowness_type) + rval = theano.tensor.mean(cost) + assert rval.ndim == 0 + return rval + +class Kording2004(object): + """This implements the Kording2004 cost using a dynamicly tracked mean, but not a + dynamically tracked variance. + + It is designed to accept 3-tensors, indexed like this: [movie_idx, frame_idx, feature_idx] + The variance in each feature will be computed over the outer two dimensions. + The speed of each feature will be computed as the first derivative over frame_idx, and the + mean over movie_idx. 
+ + """ + + def __init__(self, input, (n_movies, n_frames, n_hid), slowness_multiplier, + slowness_type='l2', + eps=None, dtype='float64'): + info('Using Kording2004') + if eps == None: + if input.dtype == 'float64': + eps = numpy.asarray(1e-8, dtype=input.dtype) + else: + eps = numpy.asarray(1e-5, dtype=input.dtype) + assert input.ndim == 3 + self.input = input + self.n_hid = n_hid + self.n_movies = n_movies + self.n_frames = n_frames + self.slowness_multiplier = slowness_multiplier + cur_mean_input = mean(input, axis=[0,1]) + assert cur_mean_input.ndim == 1 + self.mean_input = ExponentialMean.new(cur_mean_input, x_shape=(n_hid,), max_denom=500, dtype=dtype) + + assert self.mean_input.output.dtype == dtype + + centered_input = self.input - self.mean_input.output #broadcasting over first 2 of 3 dims + var_input = theano.tensor.mean(centered_input**2, axis=0) + assert var_input.dtype == dtype + + z = centered_input / theano.tensor.sqrt(var_input + eps) + + assert z.dtype == dtype + self.z = z + + self.corr = kording2004_normalized_decorrelation3(z) + assert self.corr.dtype == dtype + self.slow = kording2004_normalized_slowness3(z, slowness_type=slowness_type) + assert self.slow.dtype == dtype + + print slowness_multiplier, type(slowness_multiplier), slowness_multiplier.dtype + assert self.slowness_multiplier.dtype == dtype + + self.output = self.slowness_multiplier * self.slow + self.corr + + self.params = [] + self.updates = list(self.mean_input.updates) +
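A pure-NumPy sketch of the cost this layer assembles (illustrative only; it normalizes over all examples at once instead of tracking the mean dynamically): a decorrelation term built from the off-diagonal covariance entries, plus a slowness penalty on frame-to-frame changes of z indexed [movie, frame, feature].

.. code-block:: python

    import numpy

    def kording2004_cost_np(z, slowness_multiplier=1.0):
        n_movies, n_frames, n_hid = z.shape
        z2 = z.reshape(n_movies * n_frames, n_hid)
        z2 = (z2 - z2.mean(axis=0)) / z2.std(axis=0)          # normalize each feature
        denom = z2.shape[0] - 1
        cov_ss = numpy.sum(numpy.dot(z2.T, z2) ** 2) / denom ** 2
        var_ss = numpy.sum(numpy.sum(z2 ** 2, axis=0) ** 2) / denom ** 2
        decorrelation = cov_ss - var_ss                       # off-diagonal terms only
        z3 = z2.reshape(n_movies, n_frames, n_hid)
        slowness = numpy.mean((z3[:, 1:, :] - z3[:, :-1, :]) ** 2)
        return slowness_multiplier * slowness + decorrelation

    rng = numpy.random.RandomState(42)
    assert numpy.isfinite(kording2004_cost_np(rng.randn(4, 10, 8)))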
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/kouh2008.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,119 @@ +""" +Paper: + +This layer implements a model of complex cell firing rate responses. + +Canonical neural circuit (Kouh and Poggio, 2008) + +This layer is in a sense a 2-layer neural network, with a strange activation function +in the middle. It is introduced in "A Canonical Neural Circuit for Cortical Nonlinear +Operations", NECO 2008. It includes various complex-cell models and approximates neural +network activation functions as special cases. + +""" + +## optimizing this model may be difficult-- paper talks about using exponents p and q in +# in the range 1-3, but gradient descent may overstep that range. + +# TODO: Use updates() to clamp exponents p and q to sensible range + +import numpy +import theano +from theano import tensor +from theano.tensor.nnet import softplus +from theano.compile.sandbox import shared +from .util import add_logging, update_locals + +def _shared_uniform(rng, low, high, size, dtype, name=None): + return shared( + numpy.asarray( + rng.uniform(low=low, high=high, size=size), + dtype=dtype), name) + +class Kouh2008(object): + """WRITEME + + :param x: a list of N non-negative tensors of shape (n_examples, n_out) + :param w: a list of N output weights of shape (n_out, ) + :param p: a tensor of exponents of shape (n_out,) + :param q: a tensor of exponents of shape (n_out,) + :param k: a tensor of biases of shape (n_out,) + + output - a tensor of activations of shape (n_examples, n_out) + """ + + def __init__(self, w_list, x_list, p, q, r, k, params, updates): + """Transcription of equation 2.1 from paper that appears on page 1434. + """ + if len(w_list) != len(x_list): + raise ValueError('w_list must have same len as x_list') + output = (sum(w * tensor.pow(x, p) for (w,x) in zip(w_list, x_list)))\ + / (k + tensor.pow(sum(tensor.pow(x, q) for x in x_list), r)) + + assert output.type.ndim == 2 + update_locals(self, locals()) + + @classmethod + def new(cls, rng, x_list, n_out, dtype=None, params=[], updates=[]): + """ + """ + if dtype is None: + dtype = x_list[0].dtype + n_terms = len(x_list) + + def shared_uniform(low, high, size, name): + return _shared_uniform(rng, low, high, size, dtype, name) + + w_list = [shared_uniform(low=-2.0/n_terms, high=2.0/n_terms, size=(n_out,), name='w_%i'%i) + for i in xrange(n_terms)] + p = shared_uniform(low=1.0, high=3.0, size=(n_out,), name='p') + q = shared_uniform(low=1.0, high=3.0, size=(n_out,), name='q') + r = shared_uniform(low=0.3, high=0.8, size=(n_out,), name='r') + k = shared_uniform(low=-0.3, high=0.3, size=(n_out,), name='k') + return cls(w_list, x_list, p, q, r, k, + params = [p, q, r, k] + w_list + params, + updates=updates) + + @classmethod + def new_filters(cls, rng, input, n_in, n_out, n_terms, dtype=None): + """Return a KouhLayer instance with random parameters + + The parameters are drawn on a range [typically] suitable for fine-tuning by gradient + descent. + + + :param input: a tensor of shape (n_examples, n_in) + + :type n_in: positive int + :param n_in: number of input dimensions + + :type n_out: positive int + :param n_out: number of dimensions in rval.output + + :param nterms: each (of n_out) complex-cell firing rate will be determined from this + many 'simple cell' responses. + + :returns: KouhLayer instance with freshly-allocated random weights. 
+ + """ + if input.type.ndim != 2: + raise TypeError('matrix expected for input') + + if dtype is None: + dtype = input.dtype + + def shared_uniform(low, high, size, name): + return _shared_uniform(rng, low, high, size, dtype, name) + + f_list = [shared_uniform(low=-2.0/n_in, high=2.0/n_in, size=(n_in, n_out), name='f_%i'%i) + for i in xrange(n_terms)] + + x_list = [softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)] + + rval = cls.new(rng, x_list, n_out, dtype=dtype, params=f_list) + rval.input = input #add the input to the returned object + rval.l1 = sum(abs(fi).sum() for fi in f_list) + rval.l2_sqr = sum((fi**2).sum() for fi in f_list) + return rval + +
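A NumPy sketch (illustrative only) of the activation transcribed in Kouh2008.__init__: a weighted sum of powered inputs divided by a powered sum of the same inputs, broadcast over n_out units.

.. code-block:: python

    import numpy

    def kouh_activation(x_list, w_list, p, q, r, k):
        numer = sum(w * x ** p for w, x in zip(w_list, x_list))
        denom = k + sum(x ** q for x in x_list) ** r
        return numer / denom

    rng = numpy.random.RandomState(0)
    n_examples, n_out, n_terms = 5, 4, 3
    x_list = [rng.rand(n_examples, n_out) for _ in range(n_terms)]   # non-negative inputs
    w_list = [rng.uniform(-0.5, 0.5, size=(n_out,)) for _ in range(n_terms)]
    p = numpy.ones(n_out) * 2.0
    q = numpy.ones(n_out) * 2.0
    r = numpy.ones(n_out) * 0.5
    k = numpy.ones(n_out) * 0.1
    assert kouh_activation(x_list, w_list, p, q, r, k).shape == (n_examples, n_out)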
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/lecun1998.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,74 @@ +""" Provide the convolution and pooling layers described in LeCun 98 + +""" + +import numpy + +import theano +from theano import tensor +from theano.compile.sandbox import shared, pfunc + +from theano.sandbox.conv import ConvOp +from theano.sandbox.downsample import DownsampleFactorMax + +from .util import update_locals +from .squash import squash + +class LeNetConvPool(object): + """ + """ + + #TODO: implement biases & scales properly. There are supposed to be more parameters. + # - one bias & scale per filter + # - one bias & scale per downsample feature location (a 2d bias) + # - more? + + def __init__(self, input, w, b, conv_op, ds_op, squash_op, params): + if input.ndim != 4: + raise TypeError(input) + if w.ndim != 4: + raise TypeError(w) + if b.ndim != 1: + raise TypeError(b) + + conv_out = conv_op(input, w) + output = squash_op(ds_op(conv_out) + b.dimshuffle('x', 0, 'x', 'x')) + update_locals(self, locals()) + + @classmethod + def new(cls, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape, poolsize, + ignore_border=True, conv_subsample=(1,1), dtype=None, conv_mode='valid', + pool_type='max', squash_fn=tensor.tanh): + """ + """ + if pool_type != 'max': + # LeNet5 actually used averaging filters. Consider implementing 'mean' + # consider 'min' pooling? + # consider 'prod' pooling or some kind of geometric mean 'gmean'?? + raise NotImplementedError() + + if conv_subsample != (1,1): + # we need to adjust our calculation of the bias size + raise NotImplementedError() + + if dtype is None: + dtype = input.dtype + + if len(filter_shape) != 2: + raise TypeError(filter_shape) + + conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples, + dx=conv_subsample[0], dy=conv_subsample[1], output_mode=conv_mode) + ds_op = DownsampleFactorMax(poolsize, ignore_border=ignore_border) + + w_shp = (n_filters, n_imgs) + filter_shape + b_shp = (n_filters,) + + w = shared(numpy.asarray(rng.uniform(low=-.05, high=.05, size=w_shp), dtype=dtype)) + b = shared(numpy.asarray(rng.uniform(low=-.05, high=.05, size=b_shp), dtype=dtype)) + + if isinstance(squash_fn, str): + squash_fn = squash(squash_fn) + + return cls(input, w, b, conv_op, ds_op, squash_fn, [w,b]) +
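A NumPy sketch (illustrative) of the 2x2 max-pooling step that DownsampleFactorMax performs here after the convolution, for a (batch, channel, rows, cols) tensor with ignore_border=True.

.. code-block:: python

    import numpy

    def max_pool_2x2(x):
        b, c, r, k = x.shape
        r2, k2 = r // 2, k // 2
        x = x[:, :, :r2 * 2, :k2 * 2]      # drop the border, as ignore_border=True does
        return x.reshape(b, c, r2, 2, k2, 2).max(axis=5).max(axis=3)

    img = numpy.arange(2 * 1 * 4 * 4, dtype='float64').reshape(2, 1, 4, 4)
    pooled = max_pool_2x2(img)
    assert pooled.shape == (2, 1, 2, 2)
    assert pooled[0, 0, 0, 0] == 5.0       # max of the top-left 2x2 block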
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/logreg.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,47 @@ +"""Provides LogisticRegression +""" +import numpy +import theano +from theano.compile.sandbox import shared +from theano.tensor import nnet +from .util import update_locals, add_logging + +class LogisticRegression(object): + def __init__(self, input, w, b, params=[]): + output=nnet.softmax(theano.dot(input, w)+b) + l1=abs(w).sum() + l2_sqr = (w**2).sum() + argmax=theano.tensor.argmax(theano.dot(input, w)+b, axis=input.ndim-1) + update_locals(self, locals()) + + @classmethod + def new(cls, input, n_in, n_out, dtype=None): + if dtype is None: + dtype = input.dtype + cls._debug('allocating params w, b', n_in, n_out, dtype) + w = shared(numpy.zeros((n_in, n_out), dtype=dtype)) + b = shared(numpy.zeros((n_out,), dtype=dtype)) + return cls(input, w, b, params=[w,b]) + + + def nll(self, target): + """Return the negative log-likelihood of the prediction of this model under a given + target distribution. Passing symbolic integers here means 1-hot. + WRITEME + """ + return nnet.categorical_crossentropy(self.output, target) + + def errors(self, target): + """Return a vector of 0s and 1s, with 1s on every line that was mis-classified. + """ + if target.ndim != self.argmax.ndim: + raise TypeError('target should have the same shape as self.argmax', ('target', target.type, + 'argmax', self.argmax.type)) + if target.dtype.startswith('int'): + return theano.tensor.neq(self.argmax, target) + else: + raise NotImplementedError() + +add_logging(LogisticRegression) + +
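Numerically, the layer's output, nll and errors correspond to the following NumPy sketch (illustrative only; integer targets are treated as 1-hot).

.. code-block:: python

    import numpy

    def softmax_np(a):
        e = numpy.exp(a - a.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    rng = numpy.random.RandomState(0)
    x = rng.randn(4, 3)
    w = numpy.zeros((3, 2))
    b = numpy.zeros(2)
    p = softmax_np(numpy.dot(x, w) + b)
    target = numpy.array([0, 1, 1, 0])
    nll = -numpy.log(p[numpy.arange(len(target)), target])
    errors = (p.argmax(axis=1) != target).astype('int8')
    assert numpy.allclose(nll, numpy.log(2.0))   # uniform predictions under zero weights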
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/rust2005.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,150 @@ +""" Provides Rust2005 layer + +Paper: + +This layer implements a model of simple and complex cell firing rate responses. + +""" + +import numpy +try: + from PIL import Image +except: + pass + +import theano +import theano.tensor +import theano.tensor.nnet +from theano.compile.sandbox import shared +from theano.sandbox.softsign import softsign +from theano.tensor.nnet import softplus + +from .util import update_locals, add_logging + +def rust2005_act_from_filters(linpart, E_quad, S_quad): + sqrt = theano.tensor.sqrt + softlin = theano.tensor.nnet.softplus(linpart) + E = sqrt(sum([E_quad_i**2 for E_quad_i in E_quad] + [1e-8, softlin**2])) + S = sqrt(sum([S_quad_i**2 for S_quad_i in S_quad] + [1e-8])) + return (E-S) / (1+E+S) + +class Rust2005(object): + """ + shared variable version. + + w is 3-tensor n_in x n_out x (1+n_E_quadratic + n_S_quadratic) + + w + + """ + #logging methods come from the add_logging() call below + # _info, _debug, _warn, _error, _fatal + + def __init__(self, input, w, b, n_out, n_E_quadratic, n_S_quadratic, + epsilon, filter_shape, params): + """ + w should be a matrix with input.shape[1] rows, and n_out * + (1+n_E_quadratic+n_S_quadratic) columns. + + Every successive block of (1+n_E_quadratic+n_S_quadratic) adjacent columns contributes + to the computation of one output features. The first column in the block is the filter + for the linear term. The following n_E_quadratic columns are used to compute the + exciting quadratic part. The following n_S_quadratic columns are used to compute the + inhibitory part. + """ + if w.dtype != input.dtype: + self._warn('WARNING w type mismatch', input.dtype, w.dtype, b.dtype) + if b.dtype != input.dtype: + self._warn( 'WARNING b type mismatch', input.dtype, w.dtype, b.dtype) + #when each column of w corresponds to a flattened shape, put it here. + # filter_shape is used for rendering weights as tiled images + + filter_responses = theano.dot(input, w).reshape(( + input.shape[0], + n_out, + 1 + n_E_quadratic + n_S_quadratic)) + + assert filter_responses.dtype == input.dtype + Lf = filter_responses[:, :, 0] + Ef = filter_responses[:,:, 1:1+n_E_quadratic] + Sf = filter_responses[:,:, 1+n_E_quadratic:] + assert Lf.dtype == input.dtype + + sqrt = theano.tensor.sqrt + E = sqrt((Ef**2).sum(axis=2) + epsilon + softplus(Lf+b)**2) + S = sqrt((Sf**2).sum(axis=2) + epsilon) + + output = (E-S) / (1+E+S) + assert output.dtype == input.dtype + Ef = Ef + Sf = Sf + E = E + S = S + + l1 = abs(w).sum() + l2_sqr = (w**2).sum() + + update_locals(self, locals()) + + @classmethod + def new(cls, input, n_in, n_out, n_E, n_S, rng, eps=1.0e-6, filter_shape=None, dtype=None): + """Allocate parameters and initialize them randomly. + """ + if dtype is None: + dtype = input.dtype + epsilon = numpy.asarray(eps, dtype=dtype) + w = shared(numpy.asarray( + rng.randn(n_in, n_out*(1 + n_E + n_S))*.3 / numpy.sqrt(n_in), + dtype=dtype)) + b = shared(numpy.zeros((n_out,), dtype=dtype)) + return cls(input, w, b, n_out, n_E, n_S, epsilon, filter_shape, [w,b]) + + def img_from_weights(self, rows=12, cols=24, row_gap=1, col_gap=1, eps=1e-4): + """ Return an image that visualizes all the weights in the layer. + + The current implentation returns a tiling in which every triple of columns is a logical + group. The first column in a triple has images of the linear weights. 
The second + column in a triple has images of the exciting quadratic weights. The third column in a + triple has images of the supressive quadratic weights. + + """ + if cols % 3: #because there are three kinds of filters: linear, excitatory, inhibitory + raise ValueError("cols must be multiple of 3") + filter_shape = self.filter_shape + height = rows * (row_gap + filter_shape[0]) - row_gap + width = cols * (col_gap + filter_shape[1]) - col_gap + + out_array = numpy.zeros((height, width, 3), dtype='uint8') + + w = self.w.value + w_col = 0 + def pixel_range(x): + return 255 * (x - x.min()) / (x.max() - x.min() + eps) + for r in xrange(rows): + out_r_low = r*(row_gap + filter_shape[0]) + out_r_high = out_r_low + filter_shape[0] + for c in xrange(cols): + out_c_low = c*(col_gap + filter_shape[1]) + out_c_high = out_c_low + filter_shape[1] + out_tile = out_array[out_r_low:out_r_high, out_c_low:out_c_high,:] + + if c % 3 == 0: # linear filter + if w_col < w.shape[1]: + out_tile[...] = pixel_range(w[:,w_col]).reshape(filter_shape+(1,)) + w_col += 1 + if c % 3 == 1: # E filters + if w_col < w.shape[1]: + #filters after the 3rd do not get rendered, but are skipped over. + # there are only 3 colour channels. + for i in xrange(min(self.n_E_quadratic,3)): + out_tile[:,:,i] = pixel_range(w[:,w_col+i]).reshape(filter_shape) + w_col += self.n_E_quadratic + if c % 3 == 2: # S filters + if w_col < w.shape[1]: + #filters after the 3rd do not get rendered, but are skipped over. + # there are only 3 colour channels. + for i in xrange(min(self.n_S_quadratic,3)): + out_tile[:,:,2-i] = pixel_range(w[:,w_col+i]).reshape(filter_shape) + w_col += self.n_S_quadratic + return Image.fromarray(out_array, 'RGB') +add_logging(Rust2005)
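A NumPy sketch (illustrative) of the firing-rate nonlinearity the layer computes: an excitatory energy term E (including the softplus of the linear part) and a suppressive energy term S, combined by divisive normalization.

.. code-block:: python

    import numpy

    def softplus_np(a):
        return numpy.log1p(numpy.exp(a))

    def rust2005_np(linpart, E_quad, S_quad, eps=1e-6):
        E = numpy.sqrt(numpy.sum(E_quad ** 2, axis=-1) + softplus_np(linpart) ** 2 + eps)
        S = numpy.sqrt(numpy.sum(S_quad ** 2, axis=-1) + eps)
        return (E - S) / (1.0 + E + S)

    rng = numpy.random.RandomState(0)
    out = rust2005_np(rng.randn(10), rng.randn(10, 2), rng.randn(10, 3))
    assert out.shape == (10,)
    assert numpy.all((out > -1.0) & (out < 1.0))   # the response is bounded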
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/sandbox/adelsonbergen87.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,80 @@ +""" WRITEME + +Paper: + +This is the so-called "Energy Model" of complex cell response. + +""" +import theano +import theano.tensor +import theano.tensor.nnet +from theano.sandbox.softsign import softsign + +try: + # for render_img + from pylearn.io.image_tiling import tile_raster_images + from PIL import Image +except: + pass + +def adelson_bergen_87(filter0, filter1, limit=0): + h = theano.tensor.sqrt(filter0**2 + filter1**2 + 1.0e-8) + if limit: + return limit * softsign((1.0/limit) * h) + else: + return h + +class AdelsonBergenLayer(theano.Module): + def __init__(self, x, + w=None, u=None, + w_val=None, u_val=None, limit=False): + super(AdelsonBergenLayer, self).__init__() + + self.w = theano.tensor.dmatrix() if w is None else w + self.u = theano.tensor.dmatrix() if u is None else u + + self._params = [self.w, self.u] + + self.w_val = w_val + self.u_val = u_val + self.limit = limit + + self.output = adelson_bergen_87(theano.dot(x, self.w), theano.dot(x, self.u), limit=self.limit) + + def _instance_initialize(self, obj): + obj.w = self.w_val.copy() + obj.u = self.u_val.copy() + + def l1(self): + return abs(self.w).sum() + abs(self.u).sum() + + def l2(self): + return theano.tensor.sqrt((self.w**2).sum() + (self.u**2).sum()) + + def params(self): + return list(self._params) + + def _instance_save_img(self, obj, filename, **kwargs): + obj.render_img(**kwargs).save(filename) + + def _instance_render_img(self, obj, img_shape, + tile_shape=(12,25), tile_spacing=(1,1)): + """ Render the weights of this module an image. + :param filename: save the image to this file + :param img_shape: interpret the columns of weight matrices as images of this shape + :param tile_shape: see pylearn.io.tile_raster_images + :param tile_spacing: see pylearn.io.tile_raster_images + """ + if (img_shape[0] * img_shape[1]) != obj.w.shape[0]: + raise ValueError("Image shape doesn't match filter column length") + return Image.fromarray( + tile_raster_images(( + obj.w.T, #RED + None, #GREEN + obj.u.T, #BLUE + None), #ALPHA + img_shape=img_shape, + tile_shape=tile_shape, + tile_spacing=tile_spacing), + 'RGBA') +
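The point of the quadrature ("energy") combination above is phase invariance; a small NumPy check (illustrative, not part of the changeset):

.. code-block:: python

    import numpy

    t = numpy.linspace(0, 2 * numpy.pi, 64, endpoint=False)
    f0, f1 = numpy.cos(t), numpy.sin(t)                    # a quadrature pair of filters

    def energy(stimulus):
        return numpy.sqrt(numpy.dot(stimulus, f0) ** 2 + numpy.dot(stimulus, f1) ** 2 + 1e-8)

    responses = [energy(numpy.cos(t + phase)) for phase in numpy.linspace(0, numpy.pi, 5)]
    assert numpy.allclose(responses, responses[0])         # same response at every phase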
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/sandbox/linsvm.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,40 @@ +import numpy +import theano +from theano.compile.sandbox import shared +from theano.tensor import nnet +from .util import update_locals + +class LinearSVM(object): + def __init__(self, input, w, b, params=[]): + output=nnet.softmax(theano.dot(input, w)+b) + l1=abs(w).sum() + l2 = (w**2).sum() + argmax=theano.tensor.argmax(theano.dot(input, w)+b, axis=input.ndim-1) + update_locals(self, locals()) + + @classmethod + def new(cls, input, n_in, n_out): + w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype)) + b = shared(numpy.zeros((n_out,), dtype=input.dtype)) + return cls(input, w, b, params=[w,b]) + + + def margin(self, target): + """Return the negative log-likelihood of the prediction of this model under a given + target distribution. Passing symbolic integers here means 1-hot. + WRITEME + """ + raise NotImplementedError() + + def errors(self, target): + """Return a vector of 0s and 1s, with 1s on every line that was mis-classified. + """ + if target.ndim != self.argmax.ndim: + raise TypeError('target should have the same shape as self.argmax', ('target', target.type, + 'argmax', self.argmax.type)) + if target.dtype.startswith('int'): + return theano.tensor.neq(self.argmax, target) + else: + raise NotImplementedError() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/sgd.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,146 @@ +""" +Provides StochasticGradientDescent, HalflifeStopper +""" +import numpy +import theano +from theano import tensor +from theano.compile.sandbox import shared + +class StochasticGradientDescent(object): + """Fixed stepsize gradient descent + + For given inputs, the outputs of this object are the new values that the inputs should take + in order to perform stochastic gradient descent. + + The updates attribute is a list of (p, new_p) pairs for all inputs `p` that are + SharedVariables. + + """ + def __init__(self, inputs, cost, stepsize, gradients, params): + """ + :param stepsize: the step to take in (negative) gradient direction + :type stepsize: None, scalar value, or scalar TensorVariable + + :param updates: extra symbolic updates to make when evating either step or step_cost + (these override the gradients if necessary) + :type updates: dict Variable -> Variable + :param auxout: auxiliary outputs, list containing output symbols to + compute at the same time as cost (for efficiency) + :param methods: Should this module define the step and step_cost methods? + """ + if len(inputs) != len(gradients): + raise ValueError('inputs list and gradients list must have same len') + + self.inputs = inputs + self.params = params + self.updates = updates = [] + self.outputs = outputs = [] + + for i, g in zip(inputs, gradients): + o = i - stepsize * g + outputs.append(o) + if hasattr(i, 'value'): # this is true for shared variables, false for most things. + updates.append((i, o)) + + @classmethod + def new(cls, inputs, cost, stepsize, dtype=None): + if dtype is None: + dtype = cost.dtype + + ginputs = tensor.grad(cost, inputs) + + if isinstance(stepsize, theano.Variable): + _stepsize = stepsize + params = [] + else: + _stepsize = shared(numpy.asarray(stepsize, dtype=dtype)) + params = [_stepsize] + + if _stepsize.type.ndim != 0: + raise TypeError('stepsize must be a scalar', stepsize) + + rval = cls(inputs, cost, _stepsize, ginputs, params) + + # if we allocated a shared variable for the stepsize, + # put it into the stepsize attribute. + if params: + rval.stepsize = _stepsize + + return rval + + +class HalflifeStopper(object): + """An early-stopping crition. + + This object will track the progress of a dynamic quantity along some noisy U-shaped + trajectory. + + The heuristic used is to first iterate at least `initial_wait` times, while looking at the + score. If at any point thereafter, the score hasn't made a *significant* improvement in the + second half of the entire run, the run is declared *not*-`promising`. + + Significant improvement in the second half of a run is defined as achieving + `progresh_thresh` proportion of the best score from the first half of the run. + + .. code-block:: python + + stopper = HalflifeStopper() + ... + while (...): + stopper.step(score) + if m.stopper.best_updated: + # this is the best score we've seen yet + if not m.stopper.promising: + # we haven't seen a good score in a long time, + # and the stopper recommends giving up. 
+ break + + """ + def __init__(self, + initial_wait=20, + patience_factor=2.0, + progress_thresh=0.99 ): + """ + :param method: + :param method_output_idx: + :param initial_wait: + :param patience_factor: + :param progress_thresh: + """ + #constants + self.progress_thresh = progress_thresh + self.patience_factor = patience_factor + self.initial_wait = initial_wait + + #dynamic variables + self.iter = 0 + self.promising = True + + self.halflife_iter = -1 + self.halflife_value = float('inf') + self.halflife_updated = False + + self.best_iter = -1 + self.best_value = float('inf') + self.best_updated = False + + + def step(self, value): + if value < (self.halflife_value * self.progress_thresh): + self.halflife_updated = True + self.halflife_value = value + self.halflife_iter = self.iter + else: + self.halflife_updated = False + + if value < self.best_value: + self.best_updated = True + self.best_value = value + self.best_iter = self.iter + else: + self.best_updated = False + + self.promising = (self.iter < self.initial_wait) \ + or (self.iter < (self.halflife_iter * self.patience_factor)) + self.iter += 1 +
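A NumPy sketch (illustrative) of the update StochasticGradientDescent emits for each input: new_p = p - stepsize * grad; for shared-variable inputs the pair (p, new_p) is also placed in self.updates.

.. code-block:: python

    import numpy

    def sgd_step(params, grads, stepsize):
        return [p - stepsize * g for p, g in zip(params, grads)]

    w = numpy.array([1.0, -2.0])
    g = numpy.array([0.5, -0.5])
    (w_new,) = sgd_step([w], [g], stepsize=0.1)
    assert numpy.allclose(w_new, [0.95, -1.95])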
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/sigmoidal_layer.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,48 @@ +""" Provide the "normal" sigmoidal layers for making multi-layer perceptrons / neural nets + +""" +import logging +import numpy + +import theano +from theano import tensor +from theano.compile.sandbox import shared, pfunc +from .util import update_locals, add_logging +from .squash import squash + + +class SigmoidalLayer(object): + def __init__(self, input, w, b, squash_fn, params): + """ + :param input: a symbolic tensor of shape (n_examples, n_in) + :param w: a symbolic weight matrix of shape (n_in, n_out) + :param b: symbolic bias terms of shape (n_out,) + :param squash: an squashing function + """ + output = squash_fn(tensor.dot(input, w) + b) + update_locals(self, locals()) + + @classmethod + def new(cls, rng, input, n_in, n_out, squash_fn=tensor.tanh, dtype=None): + """Allocate a SigmoidLayer with weights to transform inputs with n_in dimensions, + to outputs of n_out dimensions. + + Weights are initialized randomly using rng. + + :param squash_fn: an op constructor function, or a string that has been registed as a + `squashing_function`. + + :param dtype: the numerical type to use for the parameters (i.e. 'float32', 'float64') + + """ + if dtype is None: + dtype = input.dtype + cls._debug('allocating weights and biases', n_in, n_out, dtype) + w = shared( + numpy.asarray( + rng.uniform(low=-2/numpy.sqrt(n_in), high=2/numpy.sqrt(n_in), + size=(n_in, n_out)), dtype=dtype)) + b = shared(numpy.asarray(numpy.zeros(n_out), dtype=dtype)) + return cls(input, w, b, squash(squash_fn), [w,b]) + +add_logging(SigmoidalLayer)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/squash.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,24 @@ +"""Provides a convenient lookup mechanism for squashing functions. +""" +_squash_dct = {} + +def squashing_function(f): + _squash_dct[f.__name__] = f + return f + +def squash(s): + try: + return _squash_dct[s] + except (KeyError, TypeError): + return s + +## initialize the registry with the standard Theano squashing functions + +import theano +import theano.tensor.nnet +import theano.sandbox.softsign + +_squash_dct['tanh'] = theano.tensor.tanh +_squash_dct['sigmoid'] = theano.tensor.nnet.sigmoid +_squash_dct['softsign'] = theano.sandbox.softsign.softsign +
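A standalone sketch of the lookup mechanism (illustrative, written without the Theano imports so it runs on its own): strings resolve through the registry, anything else is assumed to already be a callable and passes through.

.. code-block:: python

    _registry = {}

    def register(f):
        _registry[f.__name__] = f
        return f

    def lookup(s):
        try:
            return _registry[s]              # a registered name
        except (KeyError, TypeError):
            return s                         # assume it is already a squashing callable

    @register
    def my_squash(x):                        # stand-in for a real squashing op
        return x

    assert lookup('my_squash') is my_squash
    assert lookup(my_squash) is my_squash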
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/shared/layers/tests/test_kording2004.py Fri Oct 16 12:20:57 2009 -0400 @@ -0,0 +1,24 @@ + +from pylearn.shared.layers.kording2004 import * + +def test_cov_sum_of_squares(): + z = numpy.random.RandomState(5234).randn(15, 30) + z -= numpy.mean(z, axis=0) + z /= numpy.std(z, axis=0) + + cov_z = numpy.cov(z, rowvar=0) + print cov_z.shape + + real_val = numpy.sum(numpy.cov(z.T)**2) + + s = theano.tensor.dmatrix() + tall_val = theano.function([s], cov_sum_of_squares(s, 'tall'))(z) + fat_val = theano.function([s], cov_sum_of_squares(s, 'fat'))(z) + + print real_val + print tall_val + print fat_val + + assert numpy.allclose(real_val, tall_val) + assert numpy.allclose(real_val, fat_val) +
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/shared/layers/tests/test_kouh2008.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,107 @@
+import numpy
+import theano.compile.debugmode
+from theano import tensor
+from theano.compile.sandbox import pfunc
+from pylearn.shared.layers import LogisticRegression, Kouh2008
+
+def test_dtype():
+    n_in = 10
+    n_out = 10
+    n_terms = 3
+    rng = numpy.random.RandomState(23455)
+    layer = Kouh2008.new_filters(rng, tensor.dmatrix(), n_in, n_out, n_terms, dtype='float64')
+    assert layer.output.dtype == 'float64'
+    layer = Kouh2008.new_filters(rng, tensor.fmatrix(), n_in, n_out, n_terms, dtype='float32')
+    assert layer.output.dtype == 'float32'
+
+def run_w_random(bsize=10, n_iter=200, n_in=1024, n_out=100, n_terms=2, dtype='float64'):
+    x = tensor.dmatrix()
+    y = tensor.lvector()
+    rng = numpy.random.RandomState(23455)
+
+    # NOTE: the layer itself is always built in float64 here; `dtype` only controls the
+    # input data and the step-size constant used in the updates below.
+    layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64')
+    out = LogisticRegression.new(layer.output, n_out, 2)
+    cost = out.nll(y).sum()
+
+    # isolated optimization: train the logistic regression together with one layer parameter at a time
+    for ii in xrange(len(layer.params)):
+        params = out.params + [layer.params[ii]]
+        print 'PARAMS', params
+        updates = [(p, p - numpy.asarray(0.001, dtype=dtype)*gp) for p, gp in zip(params, tensor.grad(cost, params))]
+        print 'COMPILING'
+        f = pfunc([x, y], cost, updates=updates)
+        print 'DONE'
+        if False:
+            for i, n in enumerate(f.maker.env.toposort()):
+                print i, n
+
+        xval = numpy.asarray(rng.rand(bsize, n_in), dtype=dtype)
+        yval = numpy.asarray(rng.randint(0, 2, bsize), dtype='int64')
+        f0 = f(xval, yval)
+        for i in xrange(n_iter):
+            fN = f(xval, yval)
+            assert fN < f0
+            f0 = fN
+            if 0 == i % 5: print i, 'rval', fN
+
+    return fN
+
+def test_A(bsize=10, n_iter=2, n_in=10, n_out=10, n_terms=2, dtype='float64'):
+
+    x = tensor.dmatrix()
+    y = tensor.lvector()
+    rng = numpy.random.RandomState(23455)
+
+    layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64')
+    out = LogisticRegression.new(layer.output, n_out, 2)
+    cost = out.nll(y).sum()
+    # joint optimization except for one of the linear filters
+    out.w.value += 0.1 * rng.rand(*out.w.value.shape)
+    params = layer.params[:-2]
+    mode = None
+    updates = [(p, p - numpy.asarray(0.001, dtype=dtype)*gp) for p, gp in zip(params, tensor.grad(cost, params))]
+    for p, newp in updates:
+        if p is layer.r:
+            theano.compile.debugmode.debugprint(newp, depth=5)
+    f = pfunc([x, y], [cost], mode, updates=updates)
+    env_r = f.maker.env.inputs[9]
+    order = f.maker.env.toposort()
+
+    # the compiled graph should still refer directly to the layer's `r` parameter
+    assert str(f.maker.env.outputs[6].owner.inputs[0]) == 'r'
+    assert str(f.maker.env.inputs[9]) == 'r'
+    assert f.maker.env.outputs[6].owner.inputs[0] is env_r
+    assert (f.maker.env.outputs[6].owner, 0) in env_r.clients
+
+    if False:
+        for i, n in enumerate(f.maker.env.toposort()):
+            print i, n, n.inputs
+
+    xval = numpy.asarray(rng.rand(bsize, n_in), dtype=dtype)
+    yval = numpy.asarray(rng.randint(0, 2, bsize), dtype='int64')
+    for i in xrange(n_iter):
+        fN = f(xval, yval)
+        if 0 == i:
+            f0 = fN
+        print i, 'rval', fN
+        # parameters must not alias one another or the input data
+        for p0 in params:
+            for p1 in params:
+                assert p0 is p1 or not numpy.may_share_memory(p0.value, p1.value)
+        assert not numpy.may_share_memory(layer.r.value, xval)
+    print 'XVAL SUM', xval.sum(), layer.r.value.sum()
+
+    assert f0 > 6
+    assert fN < f0  # TODO: assert more improvement
+
+def test_smaller():
+    assert run_w_random(n_in=10, n_out=8) < 6.1
+
+def test_smaller32():
+    assert run_w_random(n_in=10, n_out=8, dtype='float32') < 6.1
+
+def test_big():
+    assert run_w_random() < 0.1
+
+if __name__ == '__main__':
+    test_A()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/shared/layers/tests/test_lecun1998.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,44 @@
+from pylearn.shared.layers.lecun1998 import *
+from pylearn.shared.layers import LogisticRegression
+import theano.sandbox.softsign
+# explicit imports for names otherwise picked up only via the star import above
+import numpy
+from theano import tensor
+from theano.compile.sandbox import pfunc
+
+def test_w_random(bsize=10, n_iter=100, dtype='float64'):
+    ishape = (28, 28)
+    fshape = (5, 5)
+    if dtype == 'float64':
+        x = tensor.dtensor4()
+    else:
+        x = tensor.ftensor4()
+    y = tensor.lvector()
+    rng = numpy.random.RandomState(23455)
+
+    # one conv-pool layer (6 feature maps, 5x5 filters, 2x2 pooling) followed by logistic regression
+    layer = LeNetConvPool.new(rng, x, bsize, 1, ishape, 6, fshape, (2, 2))
+    out = LogisticRegression.new(layer.output.flatten(2), 6*144, 2)
+    cost = out.nll(y).sum()
+    params = out.params + layer.params
+    updates = [(p, p - numpy.asarray(0.01, dtype=dtype)*gp) for p, gp in zip(params, tensor.grad(cost, params))]
+    f = pfunc([x, y], cost, updates=updates)
+    if True:
+        for i, n in enumerate(f.maker.env.toposort()):
+            print i, n
+
+    xval = numpy.asarray(rng.rand(bsize, 1, ishape[0], ishape[1]), dtype=dtype)
+    yval = numpy.asarray(rng.randint(0, 2, bsize), dtype='int64')
+    f0 = f(xval, yval)
+    for i in xrange(n_iter):
+        fN = f(xval, yval)
+        print i, 'rval', fN
+
+    assert f0 > 6
+    assert fN < .3
+
+
+def test_squash():
+    ishape = (28, 28)
+    fshape = (5, 5)
+    x = tensor.ftensor4()
+    y = tensor.lvector()
+    rng = numpy.random.RandomState(23455)
+
+    layer = LeNetConvPool.new(rng, x, 10, 1, ishape, 6, fshape, (2, 2), squash_fn='softsign')
+    assert layer.squash_op == theano.sandbox.softsign.softsign
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/shared/layers/tests/test_sigmoidal_layer.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,41 @@
+import numpy
+from pylearn.shared.layers import SigmoidalLayer, LogisticRegression
+from theano import tensor
+from theano.compile.sandbox import shared, pfunc
+
+def test_w_random(dtype='float64'):
+    if dtype == 'float64':
+        x = tensor.dmatrix()
+    else:
+        x = tensor.fmatrix()
+    y = tensor.lvector()
+    rng = numpy.random.RandomState(23455)
+
+    bsize = 10
+    n_in = 10
+    n_hid = 12
+    n_out = 2
+    n_iter = 100
+
+    layer = SigmoidalLayer.new(rng, x, n_in, n_hid, squash_fn='tanh', dtype=dtype)
+    out = LogisticRegression.new(layer.output, n_hid, n_out)
+    cost = out.nll(y).sum()
+    params = out.params + layer.params
+    updates = [(p, p - numpy.asarray(0.01, dtype=dtype)*gp) for p, gp in zip(params, tensor.grad(cost, params))]
+    f = pfunc([x, y], cost, updates=updates)
+
+    # remember the initial weights so we can check that training changed them
+    w0 = layer.w.value.copy()
+    b0 = layer.b.value.copy()
+
+    xval = numpy.asarray(rng.rand(bsize, n_in), dtype=dtype)
+    yval = numpy.asarray(rng.randint(0, 2, bsize), dtype='int64')
+    f0 = f(xval, yval)
+    for i in xrange(n_iter):
+        fN = f(xval, yval)
+        print i, 'rval', fN
+
+    assert f0 > 6
+    assert fN < 2
+
+    assert numpy.all(w0 != layer.w.value)
+    assert numpy.all(b0 != layer.b.value)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/shared/layers/util.py	Fri Oct 16 12:20:57 2009 -0400
@@ -0,0 +1,45 @@
+"""A few little internal functions"""
+import logging
+
+def update_locals(obj, dct):
+    """Copy the entries of `dct` (typically locals()) onto obj.__dict__, skipping 'self'."""
+    if 'self' in dct:
+        del dct['self']
+    obj.__dict__.update(dct)
+
+def LogFn(f):
+    """Wrap logging method `f` as a staticmethod that joins its arguments into one message."""
+    def rval(*args):
+        f(' '.join(str(a) for a in args))
+    return staticmethod(rval)
+
+def add_logging(cls, name=None, level=None):
+    """Add logging functions to a class: self._debug, self._info, self._warn, self._warning,
+    self._error, self._critical, self._fatal.
+
+    All of these functions have the same signature: they accept a variable number of
+    positional arguments, cast each one to a string, and log them joined by single spaces.
+
+    :type name: str
+    :param name: the name of the logger; defaults to "layers.<class name>".
+
+    :type level: None, str, or int (e.g. logging.INFO)
+    :param level: a logging level (e.g. logging.INFO) or the name of one (e.g. 'INFO').
+        If level is None, this function does not set the logging level.
+
+    """
+    if name is None:
+        name = "layers.%s" % cls.__name__
+    cls._logger = logging.getLogger(name)
+    if level:
+        try:
+            # accept level names such as 'INFO' as well as numeric levels
+            level = getattr(logging, level)
+        except (TypeError, AttributeError):
+            pass
+        cls._logger.setLevel(level)
+
+    print 'adding loggers to ', cls
+    cls._debug = LogFn(cls._logger.debug)
+    cls._info = LogFn(cls._logger.info)
+    cls._warn = cls._warning = LogFn(cls._logger.warn)
+    cls._error = LogFn(cls._logger.error)
+    cls._critical = LogFn(cls._logger.critical)
+    cls._fatal = LogFn(cls._logger.fatal)
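For reference, a minimal sketch of how update_locals and add_logging are meant to be used from a layer class. The class name, logger level, and basicConfig call below are illustrative only and are not part of the library:

import logging
from pylearn.shared.layers.util import update_locals, add_logging

class DemoLayer(object):
    """Hypothetical example class, not part of pylearn."""
    def __init__(self, n_in, n_out):
        update_locals(self, locals())   # copies n_in and n_out onto self
        self._info('built DemoLayer with', n_in, 'inputs and', n_out, 'outputs')

# attaches self._debug/_info/_warn/_error/_critical/_fatal to the class
add_logging(DemoLayer, level='INFO')

logging.basicConfig(level=logging.INFO)   # so the messages actually go somewhere
layer = DemoLayer(10, 2)                  # logs via the 'layers.DemoLayer' logger
assert layer.n_in == 10 and layer.n_out == 2

Because LogFn wraps each logger method as a staticmethod, the _info call above receives only the positional arguments it is given and emits them as a single space-joined message.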