pylearn changeset 447:0392b666320a
fixed c typos, math error in nnet_ops.py
author      James Bergstra <bergstrj@iro.umontreal.ca>
date        Wed, 27 Aug 2008 17:08:33 -0400
parents     23960ee12b52
children    0961d4b56ec5
files       nnet_ops.py
diffstat    1 files changed, 34 insertions(+), 13 deletions(-)
--- a/nnet_ops.py	Mon Aug 25 18:15:43 2008 -0400
+++ b/nnet_ops.py	Wed Aug 27 17:08:33 2008 -0400
@@ -163,8 +163,6 @@
         if (NULL != %(sm)s) Py_XDECREF(%(sm)s);
         %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s), type_num_%(x)s);
         if(!%(sm)s) {
-            // The normal cleanup code will take care of %(nll)s
-            // Py_XDECREF(%(nll)s); %(nll)s=NULL;
             PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
             %(fail)s
         }
@@ -218,10 +216,6 @@
             sm_i[j * Ssm] *= sum_inv;
         }
 
-        if (y_i >= Nx[1])
-        {
-            %(fail)s;
-        }
     """
     end_row_loop = """
@@ -296,13 +290,13 @@
             }
         }
 
-        for (size_t i = 0; i < %(dx)s->dimenstions[0]; ++i)
+        for (size_t i = 0; i < %(dx)s->dimensions[0]; ++i)
         {
             const double* __restrict__ dy_i = (double*) (%(dy)s->data + %(dy)s->strides[0] * i);
             npy_intp Sdy = %(dy)s->strides[1]/sizeof(double);
             const double* __restrict__ sm_i = (double*) (%(sm)s->data + %(sm)s->strides[0] * i);
             npy_intp Ssm = %(sm)s->strides[1]/sizeof(double);
-            const double* __restrict__ dx_i = (double*) (%(dx)s->data + %(dx)s->strides[0] * i);
+            double* __restrict__ dx_i = (double*) (%(dx)s->data + %(dx)s->strides[0] * i);
             npy_intp Sdx = %(dx)s->strides[1]/sizeof(double);
 
             double sum_dy_times_sm = 0.;
@@ -337,7 +331,6 @@
     - softmax(x+b)
     - argmax(x+b)
 
-
     softmax(x[i]) is the i'th distribution over len(x[i]) options
     argmax(x) is the index of x's greatest element
     y_idx[i] is an integer index, encoding a 1-hot distribution.
@@ -374,6 +367,21 @@
         am = y_idx.type.make_result()
         return theano.Apply(self, [x, b, y_idx], [nll, sm, am])
     def perform(self, node, input_storage, output_storage):
+        """
+        The math, where x is an input vector and t is a target index:
+
+        softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j]))
+        nll(x,t) = -log(softmax(x)[t])
+
+        We compute this by subtracting off the max of x. This avoids numerical instability.
+
+        m = max_j x[j]
+        softmax(x)[i] = exp(x[i] - m) / sum_j(exp(x[j] - m))
+
+        nll = -log(exp(x[t] - m) / sum_j(exp(x[j] - m)))
+            = -x[t] + m + log(sum_j(exp(x[j] - m)))
+
+        """
         x, b, y_idx = input_storage
         if b.shape[0] != x.shape[1]:
             raise ValueError('b must have same number of columns as x')
@@ -384,11 +392,23 @@
         nll = numpy.zeros(x.shape[0]) #nll(y | softmax(x))
         am = numpy.zeros_like(y_idx)
         for i in xrange(sm.shape[0]):
-            row = x[i] + b
+            #add the bias vector to the i'th row of x
+            row = x[i] + b
+
+            #get the maximum value of the i'th row for a numerically safe softmax / nll
             am[i] = numpy.argmax(row)
-            sm[i] = numpy.exp(row - row[am[i]]) #softmax
-            sm[i] *= 1.0 / numpy.sum(sm[i]) #vector scale
-            nll[i] = -numpy.log(sm[i, y_idx[i]]) #cross-entropy
+            m = row[am[i]]
+
+            #compute the unnormalized softmax and the normalization constant
+            sm[i] = numpy.exp(row - m)
+            sum_j = numpy.sum(sm[i]) # sum_j(exp(x[j] - m))
+
+            #normalize the softmax
+            sm[i] *= 1.0 / sum_j
+
+            # store the nll
+            nll[i] = -row[y_idx[i]] + m + numpy.log(sum_j)
+
         output_storage[0][0] = nll
         output_storage[1][0] = sm
         output_storage[2][0] = am
@@ -469,6 +489,7 @@
         """
         nll_i[0] = - x_i[y_i*Sx] - b_i[y_i*Sb]
+                   + row_max + log(sum);
         am_i[0] = row_max_j;
         """,
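
The stable formulation that perform() now implements can be sketched as a standalone NumPy function. This is a minimal sketch, not code from the changeset: the name stable_softmax_nll is hypothetical, and Python 3's range stands in for the xrange used above.

    import numpy

    def stable_softmax_nll(x, b, y_idx):
        # Row-wise softmax(x + b), argmax, and nll, mirroring perform() above.
        # x: (rows, cols) float array; b: (cols,) bias; y_idx: (rows,) int targets.
        sm = numpy.zeros_like(x)
        nll = numpy.zeros(x.shape[0])
        am = numpy.zeros(x.shape[0], dtype=y_idx.dtype)
        for i in range(x.shape[0]):
            row = x[i] + b                    # add the bias vector to the i'th row
            am[i] = numpy.argmax(row)
            m = row[am[i]]                    # m = max_j row[j], subtracted for stability
            sm[i] = numpy.exp(row - m)        # unnormalized softmax
            sum_j = numpy.sum(sm[i])          # sum_j(exp(row[j] - m))
            sm[i] /= sum_j                    # normalize
            nll[i] = -row[y_idx[i]] + m + numpy.log(sum_j)  # -log(softmax(row)[t])
        return nll, sm, am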
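The one-line addition in the last hunk supplies the row_max + log(sum) terms that the C nll expression was missing, which is presumably the "math error" of the commit message. The Python-side rewrite matters numerically as well: the old -log(softmax) path blows up when the target's softmax entry underflows to zero, while the new -x[t] + m + log(sum) form stays finite. A small illustration with made-up values, using the sketch above:

    x = numpy.array([[0.0, -800.0]])
    b = numpy.zeros(2)
    y_idx = numpy.array([1])

    # Old path: exp(-800) underflows to 0.0, so the target's softmax entry
    # is 0.0 and -log(0.0) evaluates to +inf (with a divide-by-zero warning).
    row = x[0] + b
    p = numpy.exp(row - row.max()) / numpy.sum(numpy.exp(row - row.max()))
    naive_nll = -numpy.log(p[y_idx[0]])

    # New path: nll = -row[t] + m + log(sum_j) = 800.0, finite and correct.
    nll, sm, am = stable_softmax_nll(x, b, y_idx)
    print(naive_nll, nll[0])   # inf 800.0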