# HG changeset patch
# User Joseph Turian
# Date 1213906717 14400
# Node ID 18702ceb2096038b2e896ef03b64aba7758a45db
# Parent 430c9e92cd23fcadf01769b406d1a87d3fe31210
Added more functions

diff -r 430c9e92cd23 -r 18702ceb2096 _test_dataset.py
--- a/_test_dataset.py	Thu Jun 19 16:12:29 2008 -0400
+++ b/_test_dataset.py	Thu Jun 19 16:18:37 2008 -0400
@@ -2,7 +2,7 @@
 from dataset import *
 from math import *
 import numpy, unittest, sys
-from misc import *
+#from misc import *
 from lookup_list import LookupList
 
 def have_raised(to_eval, **var):
diff -r 430c9e92cd23 -r 18702ceb2096 _test_onehotop.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_onehotop.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,21 @@
+from onehotop import one_hot
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.tensor import as_tensor
+
+import random
+import numpy.random
+
+class T_OneHot(unittest.TestCase):
+    def test0(self):
+        x = as_tensor([3, 2, 1])
+        y = as_tensor(5)
+        o = one_hot(x, y)
+        out = compile.eval_outputs([o])
+        self.failUnless(numpy.all(out == numpy.asarray([[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]])))
+
+if __name__ == '__main__':
+    unittest.main()
diff -r 430c9e92cd23 -r 18702ceb2096 _test_random_transformation.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_random_transformation.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,84 @@
+from random_transformation import row_random_transformation
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+from theano.sparse import _mtypes, _mtype_to_str
+from theano.sparse import as_sparse
+
+from theano.tensor import as_tensor
+from theano.scalar import as_scalar
+
+import random
+import numpy.random
+
+class T_RowRandomTransformation(unittest.TestCase):
+    def setUp(self):
+        random.seed(44)
+        numpy.random.seed(44)
+
+    def test_basic(self):
+        rows = 4
+        cols = 20
+        fakeseed = 0
+        length = 3
+        md = numpy.random.rand(rows, cols)
+        for mtype in _mtypes:
+            m = as_sparse(mtype(md))
+            o = row_random_transformation(m, length, initial_seed=fakeseed)
+            y = compile.eval_outputs([o])
+            expected = "[[ 0.88239119  1.03244463 -1.29297503]\n [ 0.02644961  1.50119695 -0.025081  ]\n [-0.60741013  1.25424625  0.30119422]\n [-1.08659967 -0.35531544 -1.38915467]]"
+            self.failUnless(str(y) == expected)
+
+    def test_length(self):
+        """ Test that if length is increased, we obtain the same results
+        (except longer). """
+
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(1, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+            extralength = random.randint(1, 10)
+
+            m = as_sparse(mtype(numpy.random.rand(rows, cols)))
+            o1 = row_random_transformation(m, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(m, length + extralength, initial_seed=fakeseed)
+
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless((y1 == y2[:, :length]).all())
+
+    def test_permute(self):
+        """ Test that if the order of the rows is permuted, we obtain the same results.
+        """
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(2, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+
+            permute = numpy.random.permutation(rows)
+
+            m1 = numpy.random.rand(rows, cols)
+            m2 = m1[permute]
+            for r in range(rows):
+                self.failUnless((m2[r] == m1[permute[r]]).all())
+            s1 = as_sparse(mtype(m1))
+            s2 = as_sparse(mtype(m2))
+            o1 = row_random_transformation(s1, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(s2, length, initial_seed=fakeseed)
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless(y1.shape == y2.shape)
+            for r in range(rows):
+                self.failUnless((y2[r] == y1[permute[r]]).all())
+
+if __name__ == '__main__':
+    unittest.main()
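
Note that test_basic above compares str(y) against a hard-coded repr, which ties the
test to numpy's exact print formatting. A tolerance-based check would be sturdier; a
minimal sketch (the helper name check_transformation_output is illustrative, and the
expected values are transcribed from the string above, not independently verified):

    import numpy

    def check_transformation_output(y):
        # Numeric equivalent of the `expected` string in test_basic,
        # transcribed from the repr above.
        expected = numpy.array([[ 0.88239119,  1.03244463, -1.29297503],
                                [ 0.02644961,  1.50119695, -0.025081  ],
                                [-0.60741013,  1.25424625,  0.30119422],
                                [-1.08659967, -0.35531544, -1.38915467]])
        # Comparing with a tolerance avoids depending on repr formatting.
        assert numpy.allclose(y, expected, atol=1e-6)
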
""" + for i in range(10): + mtype = random.choice(_mtypes) + rows = random.randint(2, 20) + cols = random.randint(1, 20) + fakeseed = random.randint(0, 100) + length = random.randint(1, 10) + + permute = numpy.random.permutation(rows) + + + m1 = numpy.random.rand(rows, cols) + m2 = m1[permute] + for r in range(rows): + self.failUnless((m2[r] == m1[permute[r]]).all()) + s1 = as_sparse(mtype(m1)) + s2 = as_sparse(mtype(m2)) + o1 = row_random_transformation(s1, length, initial_seed=fakeseed) + o2 = row_random_transformation(s2, length, initial_seed=fakeseed) + y1 = compile.eval_outputs([o1]) + y2 = compile.eval_outputs([o2]) + + self.failUnless(y1.shape == y2.shape) + for r in range(rows): + self.failUnless((y2[r] == y1[permute[r]]).all()) + +if __name__ == '__main__': + unittest.main() diff -r 430c9e92cd23 -r 18702ceb2096 common/__init__.py --- a/common/__init__.py Thu Jun 19 16:12:29 2008 -0400 +++ b/common/__init__.py Thu Jun 19 16:18:37 2008 -0400 @@ -1,1 +1,5 @@ -from * import * +import file +import floateq +import memory +import misc +import time diff -r 430c9e92cd23 -r 18702ceb2096 dataset.py --- a/dataset.py Thu Jun 19 16:12:29 2008 -0400 +++ b/dataset.py Thu Jun 19 16:18:37 2008 -0400 @@ -1,6 +1,6 @@ from lookup_list import LookupList as Example -from misc import unique_elements_list_intersection +from common.misc import unique_elements_list_intersection from string import join from sys import maxint import numpy, copy diff -r 430c9e92cd23 -r 18702ceb2096 featuremap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/featuremap.py Thu Jun 19 16:18:37 2008 -0400 @@ -0,0 +1,132 @@ +""" +Feature mapping. + +A feature map is idenfied by a unique name, e.g. "parsing features, experiment 35". +This unique name also determines the name of the on-disk version of the feature map. + +@todo: This should be rewritten to be more Pythonic. Perhaps use a class? +@todo: Maybe look at older C++ Id/Vocab code? Id could have a __str__ method +@todo: Clearer documentation. +@todo: Create an fmap directory +@todo: Use cPickle, not pickle + +@todo: Autosynchronize mode: Each time a new entry is added +to a L{FeatureMap}, the on-disk version of the feature map is +updated. Alternately, synchronize to disk when the object is destroyed. +""" + +from common import myopen +import pickle + +# We want this map to be a singleton +name_to_fmap = {} + +def get(name=None, synchronize=True): + """ + Get the L{FeatureMap} for a particular feature name. + """ + global name_to_fmap + if name not in name_to_fmap: + # Create a new L{FeatureMap} + name_to_fmap[name] = FeatureMap(name, synchronize) + fmap = name_to_fmap[name] + assert fmap.name == name + assert fmap.synchronize == synchronize + return fmap + +def free_memory(): + """ + Free the memory associated with all feature maps. + """ + global name_to_fmap + name_to_fmap = {} + +class KeyError(Exception): + """Exception raised for keys missing from a readonly FeatureMap + Attributes: + name -- Name of the FeatureMap raising the error. + key -- Key not present. + """ + def __init__(self, name, key): + self.name = name + self.key = key + + +class FeatureMap: + """ + Map from a feature string to a numerial ID (starting from 0). + + If synchronize is False, the feature map is considered temporary + and we never actually synchronize it with disk. It expires with the + lifetime of this execution. + + @warning: Do not construct this directly. Instead, use the global get() method. 
diff -r 430c9e92cd23 -r 18702ceb2096 misc.py
diff -r 430c9e92cd23 -r 18702ceb2096 onehotop.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,58 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct one-hot vectors: each entry of x gives the position set to one in a row of width y.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: For each one-hot vector, the index of the entry set to one.
+        @type y: Integer scalar L{Tensor}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        x = as_tensor(x)
+        y = as_tensor(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64" or x.dtype == "int32"
+        assert x.ndim == 1
+        assert y.dtype == "int64" or y.dtype == "int32"
+        assert y.ndim == 0
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
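
The loop in OneHot.perform is equivalent to the following standalone numpy sketch, a
vectorized restatement for reference (not the Op itself):

    import numpy

    def one_hot_dense(x, y):
        # x: 1-d integer array of indices; y: width of each one-hot row.
        out = numpy.zeros((x.shape[0], y), dtype="float64")
        # Set out[c, x[c]] = 1 for every row c at once.
        out[numpy.arange(x.shape[0]), x] = 1
        return out

    # Matches the expected output of test0 in _test_onehotop.py:
    # one_hot_dense(numpy.asarray([3, 2, 1]), 5) ==
    #     [[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]]
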
diff -r 430c9e92cd23 -r 18702ceb2096 onehotop.py.scalar
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py.scalar	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,64 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+#from theano import scalar
+from theano.scalar import as_scalar
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct one-hot vectors: each entry of x gives the position set to one in a row of width y.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    @todo: What type is y?
+    @todo: What about operating on L{Scalar}s?
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: For each one-hot vector, the index of the entry set to one.
+        @type y: Integer L{Scalar}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        #x = tensor.as_tensor(x)
+        #y = scalar.as_scalar(y)
+        x = as_tensor(x)
+        y = as_scalar(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64"
+        assert type(y) == numpy.int64
+        assert x.ndim == 1
+        #out = numpy.zeros((x.shape[0], y), dtype="int64")
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
diff -r 430c9e92cd23 -r 18702ceb2096 random_transformation.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_transformation.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,132 @@
+"""
+New L{Op}s that aren't in core theano
+"""
+
+from theano import sparse
+from theano import tensor
+from theano import scalar
+from theano.gof import op
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+
+import scipy.sparse
+
+import numpy
+
+class RowRandomTransformation(op.Op):
+    """
+    Given C{x}, a (sparse) matrix with shape (exmpls, dimensions), we
+    multiply it by a deterministic random matrix of shape (dimensions,
+    length) to obtain a random transformation output of shape (exmpls,
+    length).
+
+    Each element of the deterministic random matrix is selected uniformly
+    from [-1, +1).
+    @todo: Use another random distribution?
+
+    @note: This function should be written such that if length is
+    increased, we obtain the same results (except longer). Similarly,
+    the rows should be able to be permuted and give the same result in
+    the same fashion.
+
+    @todo: This may be slow?
+    @todo: Rewrite for dense matrices too?
+    @todo: Is there any way to verify the convention that each row is
+    an example? Should I rename the variables in the code to make the
+    semantics more explicit?
+    @todo: AUTOTEST: Autotest that dense and sparse versions of this are identical.
+    @todo: Rename? Is Row the correct name? Maybe column-wise?
+
+    @type x: L{scipy.sparse.spmatrix}
+    @param x: Sparse matrix to be randomly transformed with shape (exmpls, dimensions)
+    @type length: int
+    @param length: The number of transformations of C{x} to be performed.
+    @param initial_seed: Initial seed for the RNG.
+    @rtype: L{numpy.ndarray}
+    @return: Array with C{length} random transformations, with shape (exmpls, length)
+    """
+
+    import random
+    """
+    RNG used for random transformations.
+    Does not share state with the rest of the program.
+    @todo: Make STATIC and private. Ask James or Olivier how to make this more Pythonic.
+    """
+    _trng = random.Random()
+
+ """ + assert 0 # Needs to be updated to Olivier's new Op creation approach + op.Op.__init__(self, **kwargs) + x = sparse.as_sparse(x) + self.initial_seed = initial_seed + self.length = length + self.inputs = [x] + self.outputs = [tensor.Tensor(x.dtype, broadcastable=[False, False])] +# self.outputs = [tensor.Tensor(x.dtype, broadcastable=[True, True])] + + def _random_matrix_value(self, row, col, rows): + """ + From a deterministic random matrix, find one element. + @param row: The row of the element to be read. + @param col: The column of the element to be read. + @param row: The number of rows in the matrix. + @type row: int + @type col: int + @type rows: int + @note: This function is designed such that if we extend + the number of columns in the random matrix, the values of + the earlier entries is unchanged. + @todo: Make this static + """ + # Choose the random entry at (l, c) + rngidx = col * rows + row + # Set the random number state for this random entry + # Note: This may be slow + self._trng.seed(rngidx + self.initial_seed) + + # Determine the value for this entry + val = self._trng.uniform(-1, +1) +# print "Exmpl #%d, dimension #%d => Random projection #%d has idx %d (+ seed %d) and value %f" % (r, c, j, rngidx, self.initial_seed, val) + return val + + def impl(self, xorig): + assert _is_sparse(xorig) + assert len(xorig.shape) == 2 + # Since conversions to and from the COO format are quite fast, you + # can use this approach to efficiently implement lots computations + # on sparse matrices. + x = xorig.tocoo() + (rows, cols) = x.shape + tot = rows * cols + out = numpy.zeros((rows, self.length)) +# print "l = %d" % self.length +# print "x.getnnz() = %d" % x.getnnz() + all = zip(x.col, x.row, x.data) + all.sort() # TODO: Maybe this is very slow? + lastc = None + lastl = None + lastval = None + for l in range(self.length): + for (c, r, data) in all: + assert c < cols + assert r < rows + if not c == lastc or not l == lastl: + lastc = c + lastl = l + lastval = self._random_matrix_value(c, l, cols) + val = lastval +# val = self._random_matrix_value(c, l, cols) +# val = self._trng.uniform(-1, +1) +# val = 1.0 + out[r][l] += val * data + return out + def __copy__(self): + return self.__class__(self.inputs[0], self.length, self.initial_seed) + def clone_with_new_inputs(self, *new_inputs): + return self.__class__(new_inputs[0], self.length, self.initial_seed) + def desc(self, *new_inputs): + return (self.__class__, self.length, self.initial_seed) +row_random_transformation = RowRandomTransformation()