changeset 356:18702ceb2096

Added more functions
author Joseph Turian <turian@iro.umontreal.ca>
date Thu, 19 Jun 2008 16:18:37 -0400
parents 430c9e92cd23
children 2291a244a887
files _test_dataset.py _test_onehotop.py _test_random_transformation.py common/__init__.py dataset.py featuremap.py misc.py onehotop.py onehotop.py.scalar random_transformation.py
diffstat 9 files changed, 498 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/_test_dataset.py	Thu Jun 19 16:12:29 2008 -0400
+++ b/_test_dataset.py	Thu Jun 19 16:18:37 2008 -0400
@@ -2,7 +2,7 @@
 from dataset import *
 from math import *
 import numpy, unittest, sys
-from misc import *
+#from misc import *
 from lookup_list import LookupList
 
 def have_raised(to_eval, **var):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_onehotop.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,21 @@
+from onehotop import one_hot
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.tensor import as_tensor
+
+import random
+import numpy.random
+
+class T_OneHot(unittest.TestCase):
+    def test0(self):
+        x = as_tensor([3, 2, 1])
+        y = as_tensor(5)
+        o = one_hot(x, y)
+        y = compile.eval_outputs([o])
+        self.failUnless(numpy.all(y == numpy.asarray([[0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0]])))
+
+if __name__ == '__main__':
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_test_random_transformation.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,84 @@
+from random_transformation import row_random_transformation
+
+import unittest
+from theano import compile
+from theano import gradient
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+from theano.sparse import _mtypes, _mtype_to_str
+from theano.sparse import as_sparse
+
+from theano.tensor import as_tensor
+from theano.scalar import as_scalar
+
+import random
+import numpy.random
+
+class T_RowRandomTransformation(unittest.TestCase):
+    def setUp(self):
+        random.seed(44)
+        numpy.random.seed(44)
+
+    def test_basic(self):
+        rows = 4
+        cols = 20
+        fakeseed = 0
+        length = 3 
+        md = numpy.random.rand(rows, cols)
+        for mtype in _mtypes:
+            m = as_sparse(mtype(md))
+            o = row_random_transformation(m, length, initial_seed=fakeseed)
+            y = compile.eval_outputs([o])
+            expected = "[[ 0.88239119  1.03244463 -1.29297503]\n [ 0.02644961  1.50119695 -0.025081  ]\n [-0.60741013  1.25424625  0.30119422]\n [-1.08659967 -0.35531544 -1.38915467]]"
+            self.failUnless(str(y) == expected)
+
+    def test_length(self):
+        """ Test that if length is increased, we obtain the same results
+        (except longer). """
+
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(1, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+            extralength = random.randint(1, 10)
+
+            m = as_sparse(mtype(numpy.random.rand(rows, cols)))
+            o1 = row_random_transformation(m, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(m, length + extralength, initial_seed=fakeseed)
+
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless((y1 == y2[:,:length]).all())
+
+    def test_permute(self):
+        """ Test that if the order of the rows is permuted, we obtain the same results. """
+        for i in range(10):
+            mtype = random.choice(_mtypes)
+            rows = random.randint(2, 20)
+            cols = random.randint(1, 20)
+            fakeseed = random.randint(0, 100)
+            length = random.randint(1, 10)
+
+            permute = numpy.random.permutation(rows)
+
+
+            m1 = numpy.random.rand(rows, cols)
+            m2 = m1[permute]
+            for r in range(rows):
+                self.failUnless((m2[r] == m1[permute[r]]).all())
+            s1 = as_sparse(mtype(m1))
+            s2 = as_sparse(mtype(m2))
+            o1 = row_random_transformation(s1, length, initial_seed=fakeseed)
+            o2 = row_random_transformation(s2, length, initial_seed=fakeseed)
+            y1 = compile.eval_outputs([o1])
+            y2 = compile.eval_outputs([o2])
+
+            self.failUnless(y1.shape == y2.shape)
+            for r in range(rows):
+                self.failUnless((y2[r] == y1[permute[r]]).all())
+
+if __name__ == '__main__':
+    unittest.main()
--- a/common/__init__.py	Thu Jun 19 16:12:29 2008 -0400
+++ b/common/__init__.py	Thu Jun 19 16:18:37 2008 -0400
@@ -1,1 +1,5 @@
-from * import *
+import file
+import floateq
+import memory
+import misc
+import time
--- a/dataset.py	Thu Jun 19 16:12:29 2008 -0400
+++ b/dataset.py	Thu Jun 19 16:18:37 2008 -0400
@@ -1,6 +1,6 @@
 
 from lookup_list import LookupList as Example
-from misc import unique_elements_list_intersection
+from common.misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
 import numpy, copy
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/featuremap.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,132 @@
+"""
+Feature mapping.
+
+A feature map is identified by a unique name, e.g. "parsing features, experiment 35".
+This unique name also determines the name of the on-disk version of the feature map.
+
+@todo: This should be rewritten to be more Pythonic. Perhaps use a class?
+@todo: Maybe look at older C++ Id/Vocab code? Id could have a __str__ method
+@todo: Clearer documentation.
+@todo: Create an fmap directory
+@todo: Use cPickle, not pickle
+
+@todo: Autosynchronize mode: Each time a new entry is added
+to a L{FeatureMap}, the on-disk version of the feature map is
+updated. Alternately, synchronize to disk when the object is destroyed.
+"""
+
+from common import myopen
+import pickle
+
+# We want this map to be a singleton
+name_to_fmap = {}
+
+def get(name=None, synchronize=True):
+    """
+    Get the L{FeatureMap} for a particular feature name.
+    """
+    global name_to_fmap
+    if name not in name_to_fmap:
+        # Create a new L{FeatureMap}
+        name_to_fmap[name] = FeatureMap(name, synchronize)
+    fmap = name_to_fmap[name]
+    assert fmap.name == name
+    assert fmap.synchronize == synchronize
+    return fmap
+
+def free_memory():
+    """
+    Free the memory associated with all feature maps.
+    """
+    global name_to_fmap
+    name_to_fmap = {}
+
+class KeyError(Exception):
+    """Exception raised for keys missing from a readonly FeatureMap
+    Attributes:
+        name -- Name of the FeatureMap raising the error.
+        key -- Key not present.
+    """
+    def __init__(self, name, key):
+        self.name = name
+        self.key = key
+
+
+class FeatureMap:
+    """
+    Map from a feature string to a numerical ID (starting from 0).
+    
+    If synchronize is False, the feature map is considered temporary
+    and we never actually synchronize it with disk. It expires with the
+    lifetime of this execution.
+
+    @warning: Do not construct this directly. Instead, use the global get() method.
+    @todo: More documentation
+    """
+
+#    name = None
+#    synchronize = True
+#    map = {}
+#    readonly = False        # If True, then each time we look for an ID
+                            # that is not present we raise a KeyError
+    def __init__(self, name=None, synchronize=True):
+        self.name = name
+        self.synchronize = synchronize
+        self.map = {}
+        self.reverse_map = {}
+        self.readonly = False
+
+        # There must be a name provided, or we cannot perform synchronization
+        assert self.name or not self.synchronize
+
+        if self.synchronize:
+            # Try loading map from disk
+            self.load()
+
+    def exists(self, str):
+        """ Return True iff this str is in the map """
+        return str in self.map
+
+    def id(self, str):
+        """ Get the ID for this string. Add a new ID if none is available """
+        """ @todo: Don't want to synchronize every add, this may be too slow. """
+        if str not in self.map:
+            if self.readonly: raise KeyError(self.name, str)
+            l = self.len
+            self.map[str] = l
+            self.reverse_map[l] = str
+            assert l+1 == self.len
+            return l
+        else: return self.map[str]
+
+    def str(self, id):
+        """ Get the string for this ID. """
+        return self.reverse_map[id]
+
+    # This next function should just convert a list to a list
+#    def ids(self, lst):
+#        """ Get the IDs for the elements of a list. Return the ID numbers of these keys as a map. """
+#        idset = {}
+#        for k in lst:
+#            try:
+#                idset[self.id(k)] = True
+#            except KeyError, e:
+#                print "Feature map '%s' does not contain key '%s'. Skipping..." % (e.name, e.key)
+#        return idset
+
+    len = property(lambda self: len(self.map), doc="Number of different feature IDs")
+    filename = property(lambda self: "fmap.%s.pkl.gz" % self.name, doc="The on-disk file synchronized to this feature map.")
+
+    def load(self):
+        """ Load the map from disk. """
+        assert self.synchronize
+        try:
+            f = myopen(self.filename, "rb")
+            (self.map, self.reverse_map) = pickle.load(f)
+        except IOError: print "Could not open %s" % self.filename
+
+    def dump(self):
+        """ Dump the map to disk. """
+        assert self.synchronize
+        f = myopen(self.filename, "wb")
+        pickle.dump((self.map, self.reverse_map), f)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,58 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct a one-hot vector, x out of y.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: The entries of the one-hot vector to be one.
+        @type y: Integer scalar L{Tensor}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        x = as_tensor(x)
+        y = as_tensor(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64" or x.dtype == "int32"
+        assert x.ndim == 1
+        assert y.dtype == "int64" or x.dtype == "int32"
+        assert y.ndim == 0
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/onehotop.py.scalar	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,64 @@
+"""
+One hot Op
+"""
+
+#from theano import tensor
+from theano.tensor import as_tensor, Tensor
+#from theano import scalar
+from theano.scalar import as_scalar
+from theano.gof import op
+from theano.gof.graph import Apply
+
+import numpy
+
+class OneHot(op.Op):
+    """
+    Construct a one-hot vector, x out of y.
+
+    @todo: Document inputs and outputs
+    @todo: Use 'bool' as output dtype? Or, at least 'int64' ? Not float64!
+    @todo: Use 'bool' as output dtype, not 'int64' ?
+    @todo: Allow this to operate on column vectors (Tensor)
+    @todo: Describe better.
+    @todo: What type is y?
+    @todo: What about operating on L{Scalar}s?
+    """
+
+    def make_node(self, x, y):
+        """
+        @type x: Vector L{Tensor} of integers
+        @param x: The entries of the one-hot vector to be one.
+        @type y: Integer L{Scalar}
+        @param y: The length (#columns) of the one-hot vectors.
+        @return: A L{Tensor} of one-hot vectors
+
+        @precondition: x < y for all entries of x
+        @todo: Check that x and y are int types
+        """
+        #x = tensor.as_tensor(x)
+        #y = scalar.as_scalar(y)
+        x = as_tensor(x)
+        y = as_scalar(y)
+        #assert x.dtype[0:3] == "int"
+        #assert y.dtype[0:3] == "int"
+        inputs = [x, y]
+        ##outputs = [tensor.Tensor("int64", broadcastable=[False, False])]
+        #outputs = [tensor.Tensor("float64", broadcastable=[False, False])]
+        #outputs = [Tensor("int64", broadcastable=[False, False])]
+        outputs = [Tensor("float64", broadcastable=[False, False]).make_result()]
+        node = Apply(op = self, inputs = inputs, outputs = outputs)
+        return node
+
+    def perform(self, node, (x, y), (out, )):
+        assert x.dtype == "int64"
+        assert type(y) == numpy.int64
+        assert x.ndim == 1
+        #out = numpy.zeros((x.shape[0], y), dtype="int64")
+        out[0] = numpy.zeros((x.shape[0], y), dtype="float64")
+        for c in range(x.shape[0]):
+            assert x[c] < y
+            out[0][c, x[c]] = 1
+
+    def grad(self, (x, y), (out_gradient, )):
+        return None, None
+one_hot = OneHot()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_transformation.py	Thu Jun 19 16:18:37 2008 -0400
@@ -0,0 +1,132 @@
+"""
+New L{Op}s that aren't in core theano
+"""
+
+from theano import sparse
+from theano import tensor
+from theano import scalar
+from theano.gof import op
+
+from theano.sparse import _is_dense, _is_sparse, _is_dense_result, _is_sparse_result
+
+import scipy.sparse
+
+import numpy
+
+class RowRandomTransformation(op.Op):
+    """
+    Given C{x}, a (sparse) matrix with shape (exmpls, dimensions), we
+    multiply it by a deterministic random matrix of shape (dimensions,
+    length) to obtain random transformation output of shape (exmpls,
+    length).
+
+    Each element of the deterministic random matrix is selected uniformly
+    from [-1, +1).
+    @todo: Use another random distribution?
+
+    @note: This function should be written such that if length is
+    increased, we obtain the same results (except longer). Similarly,
+    the rows should be able to be permuted and get the same result in
+    the same fashion.
+
+    @todo: This may be slow?
+    @todo: Rewrite for dense matrices too?
+    @todo: Is there any way to verify the convention that each row is
+    an example? Should I rename the variables in the code to make the
+    semantics more explicit?
+    @todo: AUTOTEST: Autotest that dense and sparse versions of this are identical.
+    @todo: Rename? Is Row the correct name? Maybe column-wise?
+
+    @type  x: L{scipy.sparse.spmatrix}
+    @param x: Sparse matrix to be randomly transformed with shape (exmpls, dimensions)
+    @type  length: int
+    @param length: The number of transformations of C{x} to be performed.
+    @param initial_seed: Initial seed for the RNG.
+    @rtype: L{numpy.ndarray}
+    @return: Array with C{length} random transformations, with shape (exmpls, length)
+    """
+
+    import random
+    """
+    RNG used for random transformations.
+    Does not share state with rest of program.
+    @todo: Make STATIC and private. Ask James or Olivier how to make this more Pythonic.
+    """
+    _trng = random.Random()
+
+    def __init__(self, x, length, initial_seed=0, **kwargs):
+        """
+        @todo: Which broadcastable values should I use?
+        """
+        assert 0        # Needs to be updated to Olivier's new Op creation approach
+        op.Op.__init__(self, **kwargs)
+        x = sparse.as_sparse(x)
+        self.initial_seed = initial_seed
+        self.length = length
+        self.inputs = [x]
+        self.outputs = [tensor.Tensor(x.dtype, broadcastable=[False, False])]
+#        self.outputs = [tensor.Tensor(x.dtype, broadcastable=[True, True])]
+
+    def _random_matrix_value(self, row, col, rows):
+        """
+        From a deterministic random matrix, find one element.
+        @param row: The row of the element to be read.
+        @param col: The column of the element to be read.
+        @param rows: The number of rows in the matrix.
+        @type row: int
+        @type col: int
+        @type rows: int
+        @note: This function is designed such that if we extend
+        the number of columns in the random matrix, the values of
+        the earlier entries are unchanged.
+        @todo: Make this static
+        """
+        # Choose the random entry at (row, col)
+        rngidx = col * rows + row
+        # Set the random number state for this random entry
+        # Note: This may be slow
+        self._trng.seed(rngidx + self.initial_seed)
+
+        # Determine the value for this entry
+        val = self._trng.uniform(-1, +1)
+#       print "Exmpl #%d, dimension #%d => Random projection #%d has idx %d (+ seed %d) and value %f" % (r, c, j, rngidx, self.initial_seed, val)
+        return val
+
+    def impl(self, xorig):
+        assert _is_sparse(xorig)
+        assert len(xorig.shape) == 2
+        # Since conversions to and from the COO format are quite fast, you
+        # can use this approach to efficiently implement lots of computations
+        # on sparse matrices.
+        x = xorig.tocoo()
+        (rows, cols) = x.shape
+        tot = rows * cols
+        out = numpy.zeros((rows, self.length))
+#        print "l = %d" % self.length
+#        print "x.getnnz() = %d" % x.getnnz()
+        all = zip(x.col, x.row, x.data)
+        all.sort()      # TODO: Maybe this is very slow?
+        lastc = None
+        lastl = None
+        lastval = None
+        for l in range(self.length):
+            for (c, r, data) in all:
+                assert c < cols
+                assert r < rows
+                if not c == lastc or not l == lastl:
+                    lastc = c
+                    lastl = l
+                    lastval = self._random_matrix_value(c, l, cols)
+                val = lastval
+#                val = self._random_matrix_value(c, l, cols)
+#                val = self._trng.uniform(-1, +1)
+#                val = 1.0
+                out[r][l] += val * data
+        return out
+    def __copy__(self):
+        return self.__class__(self.inputs[0], self.length, self.initial_seed)
+    def clone_with_new_inputs(self, *new_inputs):
+        return self.__class__(new_inputs[0], self.length, self.initial_seed)
+    def desc(self, *new_inputs):
+        return (self.__class__, self.length, self.initial_seed)
+row_random_transformation = RowRandomTransformation()