# HG changeset patch # User James Bergstra # Date 1219871313 14400 # Node ID 0392b666320a10642a0354c7e9e5b90ba46f99a7 # Parent 23960ee12b52492bf7378b4101c0f4f4fa9adb83 fixed c typos, math error in nnet_ops.py diff -r 23960ee12b52 -r 0392b666320a nnet_ops.py --- a/nnet_ops.py Mon Aug 25 18:15:43 2008 -0400 +++ b/nnet_ops.py Wed Aug 27 17:08:33 2008 -0400 @@ -163,8 +163,6 @@ if (NULL != %(sm)s) Py_XDECREF(%(sm)s); %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s), type_num_%(x)s); if(!%(sm)s) { - // The normal cleanup code will take care of %(nll)s - // Py_XDECREF(%(nll)s); %(nll)s=NULL; PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); %(fail)s } @@ -218,10 +216,6 @@ sm_i[j * Ssm] *= sum_inv; } - if (y_i >= Nx[1]) - { - %(fail)s; - } """ end_row_loop = """ @@ -296,13 +290,13 @@ } } - for (size_t i = 0; i < %(dx)s->dimenstions[0]; ++i) + for (size_t i = 0; i < %(dx)s->dimensions[0]; ++i) { const double* __restrict__ dy_i = (double*) (%(dy)s->data + %(dy)s->strides[0] * i); npy_intp Sdy = %(dy)s->strides[1]/sizeof(double); const double* __restrict__ sm_i = (double*) (%(sm)s->data + %(sm)s->strides[0] * i); npy_intp Ssm = %(sm)s->strides[1]/sizeof(double); - const double* __restrict__ dx_i = (double*) (%(dx)s->data + %(dx)s->strides[0] * i); + double* __restrict__ dx_i = (double*) (%(dx)s->data + %(dx)s->strides[0] * i); npy_intp Sdx = %(dx)s->strides[1]/sizeof(double); double sum_dy_times_sm = 0.; @@ -337,7 +331,6 @@ - softmax(x+b) - argmax(x+b) - softmax(x[i]) is the i'th distribution over len(x[i]) options argmax(x) is the index of x's greatest element y_idx[i] is an integer index, encoding a 1-hot distribution. @@ -374,6 +367,21 @@ am = y_idx.type.make_result() return theano.Apply(self, [x, b, y_idx], [nll, sm, am]) def perform(self, node, input_storage, output_storage): + """ + The math, where x is an input vector, and t is a target index: + + softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j])) + nll(x,t) = -log(softmax(x)[t]) + + We compute this by subtracting off the max of x. This avoids numerical instability. + + m = max_j x[j] + softmax(x)[i] = exp(x[i] -m) / sum_j(exp(x[j] - m)) + + nll = -log(exp(x[t] -m) / sum_j(exp(x[j] - m))) + = -x[t] + m + log( sum_j(exp(x[j] - m))) + + """ x, b, y_idx = input_storage if b.shape[0] != x.shape[1]: raise ValueError('b must have same number of columns as x') @@ -384,11 +392,23 @@ nll = numpy.zeros(x.shape[0]) #nll(y | softmax(x)) am = numpy.zeros_like(y_idx) for i in xrange(sm.shape[0]): - row = x[i] + b + #add the bias vector to the i'th row of x + row = x[i] + b + + #get the maximum value of i'th row for numerically safe softmax / nll am[i] = numpy.argmax(row) - sm[i] = numpy.exp(row - row[am[i]]) #softmax - sm[i] *= 1.0 / numpy.sum(sm[i]) #vector scale - nll[i] = -numpy.log(sm[i, y_idx[i]]) #cross-entropy + m = row[am[i]] + + #compute the unnormalized softmax, and normalization constant + sm[i] = numpy.exp(row - m) + sum_j = numpy.sum(sm[i]) # sum_j(exp(x[j] - m)) + + #normalized our softmax + sm[i] *= 1.0 / sum_j + + # store the nll + nll[i] = -row[y_idx[i]] + m + numpy.log(sum_j) + output_storage[0][0] = nll output_storage[1][0] = sm output_storage[2][0] = am @@ -469,6 +489,7 @@ """ nll_i[0] = - x_i[y_i*Sx] - b_i[y_i*Sb] + + row_max + log(sum); am_i[0] = row_max_j; """,