Commit e7d713cf authored by Davis King

Added softmax_all layer.

parent 11145541
@@ -1223,32 +1223,36 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
namespace ttimpl
{
void softmax (
const long num_locations,
const long num_channels,
tensor& dest,
const tensor& src
)
{
DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
DLIB_CASSERT(have_same_dimensions(dest,src));
const auto d = dest.host();
const auto s = src.host();
const long num = src.nr()*src.nc();
// Note that we subtract out the max values in each channel before applying
// exp() to avoid numeric overflow in the subsequent computations. Doing this
// doesn't change the resulting output, it just makes it more numerically
// stable.
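// (Editor's note, not part of this commit: the identity
// exp(x_k - m)/sum_j(exp(x_j - m)) == exp(x_k)/sum_j(exp(x_j)) holds for any constant m,
// so using m = max(x) leaves the output unchanged while keeping the arguments to exp()
// <= 0; a raw input of, say, 90.0f would already make exp() overflow a float.)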
for (long n = 0; n < src.num_samples(); ++n)
{
auto ss = s + num*src.k()*n;
auto dd = d + num*src.k()*n;
for (long i = 0; i < num; ++i)
auto ss = s + num_locations*num_channels*n;
auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
float max_val = -std::numeric_limits<float>::infinity();
for (long k = 0; k < src.k(); ++k)
max_val = std::max(max_val, ss[k*num]);
for (long k = 0; k < num_channels; ++k)
max_val = std::max(max_val, ss[k*num_locations]);
for (long k = 0; k < src.k(); ++k)
dd[k*num] = std::exp(ss[k*num]-max_val);
for (long k = 0; k < num_channels; ++k)
dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
++ss;
++dd;
@@ -1258,67 +1262,108 @@ namespace dlib
// Now normalize each channel so they sum to 1.
for (long n = 0; n < src.num_samples(); ++n)
{
const auto dd = d + num*src.k()*n;
for (long r = 0; r < src.nr(); ++r)
const auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
for (long c = 0; c < src.nc(); ++c)
{
const auto ddd = dd+r*src.nc()+c;
const auto ddd = dd+i;
float temp = 0;
for (long k = 0; k < src.k(); ++k)
temp += ddd[k*num];
for (long k = 0; k < src.k(); ++k)
ddd[k*num] /= temp;
}
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += ddd[k*num_locations];
for (long k = 0; k < num_channels; ++k)
ddd[k*num_locations] /= temp;
}
}
}
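// Editor's note (not part of this commit): the routine above makes two passes over each
// length-num_channels vector: the first pass writes exp(x - max(x)) into dest, the
// second pass divides those values by their sum so each vector of dest sums to 1.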
void softmax_gradient (
const long num_locations,
const long num_channels,
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
const auto d = dest.host();
const auto g = grad.host();
const auto in = gradient_input.host();
const long num = grad.nr()*grad.nc();
for (long n = 0; n < grad.num_samples(); ++n)
{
const auto d2 = d + num*grad.k()*n;
const auto g2 = g + num*grad.k()*n;
const auto in2 = in + num*grad.k()*n;
for (long r = 0; r < grad.nr(); ++r)
const auto d2 = d + num_locations*num_channels*n;
const auto g2 = g + num_locations*num_channels*n;
const auto in2 = in + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
for (long c = 0; c < grad.nc(); ++c)
const auto d3 = d2+i;
const auto g3 = g2+i;
const auto in3 = in2+i;
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += -d3[k*num_locations]*in3[k*num_locations];
if (is_same_object(gradient_input, grad))
{
const auto d3 = d2+r*grad.nc()+c;
const auto g3 = g2+r*grad.nc()+c;
const auto in3 = in2+r*grad.nc()+c;
float temp = 0;
for (long k = 0; k < grad.k(); ++k)
temp += -d3[k*num]*in3[k*num];
if (is_same_object(gradient_input, grad))
{
for (long k = 0; k < grad.k(); ++k)
g3[k*num] = d3[k*num]*(temp+in3[k*num]);
}
else
{
for (long k = 0; k < grad.k(); ++k)
g3[k*num] += d3[k*num]*(temp+in3[k*num]);
}
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
}
else
{
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
}
}
}
}
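// Editor's note (not part of this commit): the loop above is the standard softmax
// Jacobian-vector product.  With d = s(x) and incoming gradient 'in',
// dL/dx_k = d_k*(in_k - sum_j(d_j*in_j)), which is exactly
// g3[k*num_locations] = d3[k*num_locations]*(temp + in3[k*num_locations]) with
// temp = -sum_j(d3[j*num_locations]*in3[j*num_locations]).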
}
// ----------------------------------------------------------------------------------------
void softmax (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
}
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
}
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src);
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input);
}
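// Editor's note (not part of this commit): the four wrappers above differ only in how each
// sample is sliced before calling into ttimpl:
//   softmax()/softmax_gradient():         num_locations = nr()*nc(), num_channels = k()
//       -> one s() evaluation per spatial location, normalizing across the channels.
//   softmax_all()/softmax_all_gradient(): num_locations = 1, num_channels = nr()*nc()*k()
//       -> one s() evaluation per sample, normalizing over the whole sample.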
// ------------------------------------------------------------------------------------
......
@@ -248,6 +248,19 @@ namespace dlib
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
......
@@ -1385,6 +1385,60 @@ namespace dlib
grad.device()));
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
if (src.size() == 0)
return;
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN(cudnnSoftmaxForward(context(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
&alpha,
descriptor(src),
src.device(),
&beta,
descriptor(dest),
dest.device()));
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(
have_same_dimensions(dest,gradient_input) == true &&
have_same_dimensions(dest,grad) == true );
if (dest.size() == 0)
return;
const float alpha = 1;
const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnSoftmaxBackward(context(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
&alpha,
descriptor(dest),
dest.device(),
descriptor(gradient_input),
gradient_input.device(),
&beta,
descriptor(grad),
grad.device()));
}
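// Editor's note (not part of this commit): the only substantive difference from the
// existing cuda::softmax()/softmax_gradient() in this file appears to be the cuDNN mode.
// CUDNN_SOFTMAX_MODE_INSTANCE normalizes over the whole C*H*W extent of each sample,
// while the per-location softmax uses CUDNN_SOFTMAX_MODE_CHANNEL, which normalizes across
// the channels at each spatial position.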
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
......
@@ -387,6 +387,19 @@ namespace dlib
is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
......
@@ -2610,6 +2610,70 @@ namespace dlib
using softmax = add_layer<softmax_, SUBNET>;
// ----------------------------------------------------------------------------------------
class softmax_all_
{
public:
softmax_all_()
{
}
template <typename SUBNET>
void setup (const SUBNET& /*sub*/)
{
}
void forward_inplace(const tensor& input, tensor& output)
{
tt::softmax_all(output, input);
}
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor&
)
{
tt::softmax_all_gradient(data_grad, computed_output, gradient_input);
}
const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }
friend void serialize(const softmax_all_& , std::ostream& out)
{
serialize("softmax_all_", out);
}
friend void deserialize(softmax_all_& , std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "softmax_all_")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_all_.");
}
friend std::ostream& operator<<(std::ostream& out, const softmax_all_& )
{
out << "softmax_all";
return out;
}
friend void to_xml(const softmax_all_& /*item*/, std::ostream& out)
{
out << "<softmax_all/>\n";
}
private:
resizable_tensor params;
};
template <typename SUBNET>
using softmax_all = add_layer<softmax_all_, SUBNET>;
// ----------------------------------------------------------------------------------------
namespace impl
{
template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES>
......
@@ -2116,6 +2116,42 @@ namespace dlib
template <typename SUBNET>
using softmax = add_layer<softmax_, SUBNET>;
// ----------------------------------------------------------------------------------------
class softmax_all_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
defined above. In particular, it defines a softmax layer. To be precise,
we define the softmax function s(x) as:
s(x) == exp(x)/sum(exp(x))
where x is a vector. Then this layer treats its input tensor as a
collection of tensor::num_samples() vectors and applies s() to each vector
in the tensor. Therefore, there are logically tensor::num_samples()
invocations of s().
!*/
public:
softmax_all_(
);
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
interface. Note that this layer doesn't have any parameters, so the tensor
returned by get_layer_params() is always empty.
!*/
};
template <typename SUBNET>
using softmax_all = add_layer<softmax_all_, SUBNET>;
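// Editor's usage sketch (hypothetical network, not part of this commit): softmax_all
// composes like any other dlib layer.  For example, the following type turns the 10
// outputs of a fully connected layer into a single 10-way probability distribution for
// each sample:
//
//     using example_net = softmax_all<fc<10, input<matrix<float>>>>;
//
// fc<>, input<>, and matrix<> are pre-existing dlib components; the particular
// architecture shown is only illustrative.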
// ----------------------------------------------------------------------------------------
template <
......
@@ -741,6 +741,33 @@ namespace dlib { namespace tt
#endif
}
// ----------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
#ifdef DLIB_USE_CUDA
cuda::softmax_all(dest,src);
#else
cpu::softmax_all(dest,src);
#endif
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
#ifdef DLIB_USE_CUDA
cuda::softmax_all_gradient(grad, dest, gradient_input);
#else
cpu::softmax_all_gradient(grad, dest, gradient_input);
#endif
}
// ----------------------------------------------------------------------------------------
void sigmoid (
......
@@ -1216,6 +1216,44 @@ namespace dlib { namespace tt
is_same_object(grad, gradient_input)==true
!*/
// ----------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- Note that the softmax function is a vector valued function:
s(x) == exp(x)/sum(exp(x))
- Computes the softmax function on src and writes the results to dest. The
softmax is computed once per sample, over all of the k()*nr()*nc() values in
that sample. So unlike softmax(), which evaluates s() separately at each
spatial location, softmax_all() treats each sample as one long vector and
applies s() to it just once.
- This function supports in-place operation, i.e. having
is_same_object(dest, src)==true
!*/
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad, dest)==false
ensures
- We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor.
Then let f(SRC) == dot(gradient_input,dest) Then this function computes the
gradient of f() with respect to SRC and assigns it to grad.
- This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true
!*/
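// Editor's sketch (hypothetical values, not part of this commit) contrasting the two
// routines on a tensor with num_samples()==1, k()==2, nr()==1, nc()==2:
//
//     resizable_tensor src(1,2,1,2), s1, s2;
//     src = 1;                    // any contents would do
//     s1.copy_size(src);  s2.copy_size(src);
//     tt::softmax(s1, src);       // one s() per spatial location: at each of the 2
//                                 // locations the 2 channel values of s1 sum to 1.
//     tt::softmax_all(s2, src);   // one s() per sample: all 2*1*2 == 4 values of s2
//                                 // sum to 1 (here each becomes 0.25).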
// ----------------------------------------------------------------------------------------
void sigmoid (
......
@@ -153,6 +153,68 @@ namespace
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
DLIB_TEST(grad_error < 0.001);
#ifdef DLIB_USE_CUDA
resizable_tensor src1 = src;
resizable_tensor src2 = src;
resizable_tensor dest1, dest2;
dest1.copy_size(src);
dest2.copy_size(src);
cuda::softmax_all(dest1, src1);
cpu::softmax_all(dest2, src2);
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
}
void test_softmax_all()
{
using namespace dlib::tt;
print_spinner();
const long nr = 3;
const long nc = 3;
resizable_tensor src(5,5,nr,nr), dest(5,5,nr,nc), gradient_input(5,5,nr,nc);
tt::tensor_rand rnd;
rnd.fill_uniform(src);
rnd.fill_uniform(dest);
// fill like this as a test of the assignment operator.
gradient_input = matrix_cast<float>(gaussian_randm(5,5*nr*nc, 2));
auto grad_src = [&](long idx) {
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
tt::softmax_all(dest, src);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
};
const float eps = 0.01;
return (f(+eps)-f(-eps))/(2*eps);
};
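// Editor's note (not part of this commit): grad_src estimates d f / d src[idx] by central
// difference, (f(+eps)-f(-eps))/(2*eps), where f == dot(gradient_input, softmax_all(src)).
// compare_gradients() below then checks the analytic softmax_all_gradient() output against
// these numeric estimates.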
resizable_tensor src_grad;
src_grad.copy_size(src);
src_grad = 0;
tt::softmax_all(dest, src);
softmax_all_gradient(src_grad, dest, gradient_input);
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
DLIB_TEST(grad_error < 0.001);
#ifdef DLIB_USE_CUDA
resizable_tensor src1 = src;
resizable_tensor src2 = src;
resizable_tensor dest1, dest2;
dest1.copy_size(src);
dest2.copy_size(src);
cuda::softmax_all(dest1, src1);
cpu::softmax_all(dest2, src2);
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2))));
#endif
}
void test_batch_normalize()
@@ -1701,6 +1763,12 @@ namespace
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
{
print_spinner();
softmax_all_ l;
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
}
// ----------------------------------------------------------------------------------------
@@ -2988,6 +3056,7 @@ namespace
test_avg_pool(4,5,40,50,0,1);
test_tanh();
test_softmax();
test_softmax_all();
test_sigmoid();
test_batch_normalize();
test_batch_normalize_conv();
......