Commit e7d713cf authored by Davis King

Added softmax_all layer.

parent 11145541
@@ -1223,32 +1223,36 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
namespace ttimpl
{
void softmax (
const long num_locations,
const long num_channels,
tensor& dest,
const tensor& src
)
{
DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
DLIB_CASSERT(have_same_dimensions(dest,src));
const auto d = dest.host();
const auto s = src.host();
const long num = src.nr()*src.nc();
// Note that we subtract out the max values in each channel before applying
// exp() to avoid numeric overflow in the subsequent computations. Doing this
// doesn't change the resulting output, it just makes it more numerically
// stable.
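// (Editor's note, not part of this commit: the identity
// exp(x_k - m)/sum_j(exp(x_j - m)) == exp(x_k)/sum_j(exp(x_j)) holds for any constant m,
// so using m = max(x) leaves the output unchanged while keeping the arguments to exp()
// <= 0; a raw input of, say, 90.0f would already make exp() overflow a float.)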
for (long n = 0; n < src.num_samples(); ++n)
{
auto ss = s + num*src.k()*n;
auto dd = d + num*src.k()*n;
for (long i = 0; i < num; ++i)
auto ss = s + num_locations*num_channels*n;
auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
float max_val = -std::numeric_limits<float>::infinity();
for (long k = 0; k < src.k(); ++k)
max_val = std::max(max_val, ss[k*num]);
for (long k = 0; k < num_channels; ++k)
max_val = std::max(max_val, ss[k*num_locations]);
for (long k = 0; k < src.k(); ++k)
dd[k*num] = std::exp(ss[k*num]-max_val);
for (long k = 0; k < num_channels; ++k)
dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
++ss;
++dd;
@@ -1258,67 +1262,108 @@ namespace dlib
// Now normalize each channel so they sum to 1.
for (long n = 0; n < src.num_samples(); ++n)
{
const auto dd = d + num*src.k()*n;
for (long r = 0; r < src.nr(); ++r)
const auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
for (long c = 0; c < src.nc(); ++c)
{
const auto ddd = dd+r*src.nc()+c;
const auto ddd = dd+i;
float temp = 0;
for (long k = 0; k < src.k(); ++k)
temp += ddd[k*num];
for (long k = 0; k < src.k(); ++k)
ddd[k*num] /= temp;
}
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += ddd[k*num_locations];
for (long k = 0; k < num_channels; ++k)
ddd[k*num_locations] /= temp;
}
}
}
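// Editor's note (not part of this commit): the routine above makes two passes over each
// length-num_channels vector: the first pass writes exp(x - max(x)) into dest, the
// second pass divides those values by their sum so each vector of dest sums to 1.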
void softmax_gradient (
const long num_locations,
const long num_channels,
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
const auto d = dest.host();
const auto g = grad.host();
const auto in = gradient_input.host();
const long num = grad.nr()*grad.nc();
for (long n = 0; n < grad.num_samples(); ++n)
{
const auto d2 = d + num*grad.k()*n;
const auto g2 = g + num*grad.k()*n;
const auto in2 = in + num*grad.k()*n;
for (long r = 0; r < grad.nr(); ++r)
const auto d2 = d + num_locations*num_channels*n;
const auto g2 = g + num_locations*num_channels*n;
const auto in2 = in + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
for (long c = 0; c < grad.nc(); ++c)
const auto d3 = d2+i;
const auto g3 = g2+i;
const auto in3 = in2+i;
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += -d3[k*num_locations]*in3[k*num_locations];
if (is_same_object(gradient_input, grad))
{
const auto d3 = d2+r*grad.nc()+c;
const auto g3 = g2+r*grad.nc()+c;
const auto in3 = in2+r*grad.nc()+c;
float temp = 0;
for (long k = 0; k < grad.k(); ++k)
temp += -d3[k*num]*in3[k*num];
if (is_same_object(gradient_input, grad))
{
for (long k = 0; k < grad.k(); ++k)
g3[k*num] = d3[k*num]*(temp+in3[k*num]);
}
else
{
for (long k = 0; k < grad.k(); ++k)
g3[k*num] += d3[k*num]*(temp+in3[k*num]);
}
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
}
else
{
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
}
}
}
}
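// Editor's note (not part of this commit): the loop above is the standard softmax
// Jacobian-vector product.  With d = s(x) and incoming gradient 'in',
// dL/dx_k = d_k*(in_k - sum_j(d_j*in_j)), which is exactly
// g3[k*num_locations] = d3[k*num_locations]*(temp + in3[k*num_locations]) with
// temp = -sum_j(d3[j*num_locations]*in3[j*num_locations]).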
}
// ----------------------------------------------------------------------------------------
void softmax (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
}
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
}
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src);
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input);
}
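// Editor's note (not part of this commit): the four wrappers above differ only in how each
// sample is sliced before calling into ttimpl:
//   softmax()/softmax_gradient():         num_locations = nr()*nc(), num_channels = k()
//       -> one s() evaluation per spatial location, normalizing across the channels.
//   softmax_all()/softmax_all_gradient(): num_locations = 1, num_channels = nr()*nc()*k()
//       -> one s() evaluation per sample, normalizing over the whole sample.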
// ------------------------------------------------------------------------------------
......
@@ -248,6 +248,19 @@ namespace dlib
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
......
@@ -1385,6 +1385,60 @@ namespace dlib
grad.device()));
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
if (src.size() == 0)
return;
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN(cudnnSoftmaxForward(context(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
&alpha,
descriptor(src),
src.device(),
&beta,
descriptor(dest),
dest.device()));
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(
have_same_dimensions(dest,gradient_input) == true &&
have_same_dimensions(dest,grad) == true );
if (dest.size() == 0)
return;
const float alpha = 1;
const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnSoftmaxBackward(context(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
&alpha,
descriptor(dest),
dest.device(),
descriptor(gradient_input),
gradient_input.device(),
&beta,
descriptor(grad),
grad.device()));
}
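// Editor's note (not part of this commit): the only substantive difference from the
// existing cuda::softmax()/softmax_gradient() in this file appears to be the cuDNN mode.
// CUDNN_SOFTMAX_MODE_INSTANCE normalizes over the whole C*H*W extent of each sample,
// while the per-location softmax uses CUDNN_SOFTMAX_MODE_CHANNEL, which normalizes across
// the channels at each spatial position.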
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
......
@@ -387,6 +387,19 @@ namespace dlib
is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
......
@@ -2610,6 +2610,70 @@ namespace dlib
using softmax = add_layer<softmax_, SUBNET>;
// ----------------------------------------------------------------------------------------
class softmax_all_
{
public:
softmax_all_()
{
}
template <typename SUBNET>
void setup (const SUBNET& /*sub*/)
{
}
void forward_inplace(const tensor& input, tensor& output)
{
tt::softmax_all(output, input);
}
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor&
)
{
tt::softmax_all_gradient(data_grad, computed_output, gradient_input);
}
const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }
friend void serialize(const softmax_all_& , std::ostream& out)
{
serialize("softmax_all_", out);
}
friend void deserialize(softmax_all_& , std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "softmax_all_")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_all_.");
}
friend std::ostream& operator<<(std::ostream& out, const softmax_all_& )
{
out << "softmax_all";
return out;
}
friend void to_xml(const softmax_all_& /*item*/, std::ostream& out)
{
out << "<softmax_all/>\n";
}
private:
resizable_tensor params;
};
template <typename SUBNET>
using softmax_all = add_layer<softmax_all_, SUBNET>;
// ----------------------------------------------------------------------------------------
namespace impl
{
template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES>
......
@@ -2116,6 +2116,42 @@ namespace dlib
template <typename SUBNET>
using softmax = add_layer<softmax_, SUBNET>;
// ----------------------------------------------------------------------------------------
class softmax_all_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
defined above. In particular, it defines a softmax layer. To be precise,
we define the softmax function s(x) as:
s(x) == exp(x)/sum(exp(x))
where x is a vector. Then this layer treats its input tensor as a
collection of tensor::num_samples() vectors and applies s() to each vector
in the tensor. Therefore, there are logically tensor::num_samples()
invocations of s().
!*/
public:
softmax_all_(
);
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
interface. Note that this layer doesn't have any parameters, so the tensor
returned by get_layer_params() is always empty.
!*/
};
template <typename SUBNET>
using softmax_all = add_layer<softmax_all_, SUBNET>;
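// Editor's usage sketch (hypothetical network, not part of this commit): softmax_all
// composes like any other dlib layer.  For example, the following type turns the 10
// outputs of a fully connected layer into a single 10-way probability distribution for
// each sample:
//
//     using example_net = softmax_all<fc<10, input<matrix<float>>>>;
//
// fc<>, input<>, and matrix<> are pre-existing dlib components; the particular
// architecture shown is only illustrative.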
// ----------------------------------------------------------------------------------------
template <
......
@@ -741,6 +741,33 @@ namespace dlib { namespace tt
#endif
}
// ----------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
#ifdef DLIB_USE_CUDA
cuda::softmax_all(dest,src);
#else
cpu::softmax_all(dest,src);
#endif
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
#ifdef DLIB_USE_CUDA
cuda::softmax_all_gradient(grad, dest, gradient_input);
#else
cpu::softmax_all_gradient(grad, dest, gradient_input);
#endif
}
// ----------------------------------------------------------------------------------------
void sigmoid (
......
@@ -1216,6 +1216,44 @@ namespace dlib { namespace tt
is_same_object(grad, gradient_input)==true
!*/
// ----------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- Note that the softmax function is a vector valued function:
s(x) == exp(x)/sum(exp(x))
- Computes the softmax function on src and writes the results to dest. The
softmax is computed once per sample, over all of the k()*nr()*nc() values in
that sample. So unlike softmax(), which evaluates s() separately at each
spatial location, softmax_all() treats each sample as one long vector and
applies s() to it just once.
- This function supports in-place operation, i.e. having
is_same_object(dest, src)==true
!*/
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad, dest)==false
ensures
- We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor.
Then let f(SRC) == dot(gradient_input,dest) Then this function computes the
gradient of f() with respect to SRC and assigns it to grad.
- This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true
!*/
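// Editor's sketch (hypothetical values, not part of this commit) contrasting the two
// routines on a tensor with num_samples()==1, k()==2, nr()==1, nc()==2:
//
//     resizable_tensor src(1,2,1,2), s1, s2;
//     src = 1;                    // any contents would do
//     s1.copy_size(src);  s2.copy_size(src);
//     tt::softmax(s1, src);       // one s() per spatial location: at each of the 2
//                                 // locations the 2 channel values of s1 sum to 1.
//     tt::softmax_all(s2, src);   // one s() per sample: all 2*1*2 == 4 values of s2
//                                 // sum to 1 (here each becomes 0.25).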
// ----------------------------------------------------------------------------------------
void sigmoid (
......
@@ -153,6 +153,68 @@ namespace
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
DLIB_TEST(grad_error < 0.001);
#ifdef DLIB_USE_CUDA
resizable_tensor src1 = src;
resizable_tensor src2 = src;
resizable_tensor dest1, dest2;
dest1.copy_size(src);
dest2.copy_size(src);
cuda::softmax_all(dest1, src1);
cpu::softmax_all(dest2, src2);
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
}
void test_softmax_all()
{
using namespace dlib::tt;
print_spinner();
const long nr = 3;
const long nc = 3;
resizable_tensor src(5,5,nr,nr), dest(5,5,nr,nc), gradient_input(5,5,nr,nc);
tt::tensor_rand rnd;
rnd.fill_uniform(src);
rnd.fill_uniform(dest);
// fill like this as a test of the assignment operator.
gradient_input = matrix_cast<float>(gaussian_randm(5,5*nr*nc, 2));
auto grad_src = [&](long idx) {
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
tt::softmax_all(dest, src);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
};
const float eps = 0.01;
return (f(+eps)-f(-eps))/(2*eps);
};
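// Editor's note (not part of this commit): grad_src estimates d f / d src[idx] by central
// difference, (f(+eps)-f(-eps))/(2*eps), where f == dot(gradient_input, softmax_all(src)).
// compare_gradients() below then checks the analytic softmax_all_gradient() output against
// these numeric estimates.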
resizable_tensor src_grad;
src_grad.copy_size(src);
src_grad = 0;
tt::softmax_all(dest, src);
softmax_all_gradient(src_grad, dest, gradient_input);
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
DLIB_TEST(grad_error < 0.001);
#ifdef DLIB_USE_CUDA
resizable_tensor src1 = src;
resizable_tensor src2 = src;
resizable_tensor dest1, dest2;
dest1.copy_size(src);
dest2.copy_size(src);
cuda::softmax_all(dest1, src1);
cpu::softmax_all(dest2, src2);
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
}
void test_batch_normalize()
@@ -1701,6 +1763,12 @@ namespace
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
{
print_spinner();
softmax_all_ l;
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
}
// ----------------------------------------------------------------------------------------
@@ -2988,6 +3056,7 @@ namespace
test_avg_pool(4,5,40,50,0,1);
test_tanh();
test_softmax();
test_softmax_all();
test_sigmoid();
test_batch_normalize();
test_batch_normalize_conv();
......