Commit 8421f213 authored by Davis King

Fixed the in-place layers so that they don't interfere with the operation of
skip layers and add_prev style layers.  In particular, in-place layers now only
overwrite the gradient information in their child layer if they are operating
in in-place mode.  Otherwise, they add their gradients to their child layers.

It should also be noted that it's safe for in-place layers to overwrite
gradients when in in-place mode, since their child layers are inaccessible while
in-place layers operate in in-place mode.  This prevents any other layers from
trying to add to the child layer, thereby avoiding the possibility of layer
interference.  So the bug this change fixes is that, when not in in-place mode,
the child layers are still accessible, but in-place layers were *still*
overwriting child gradients.
parent 81eb18a4
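To make the convention concrete, here is a minimal sketch of the rule the commit message describes, written with plain std::vector rather than dlib's tensor type (the name backward_sketch and the 2*x derivative are illustrative placeholders, not code from this commit):

```cpp
#include <vector>
#include <cstddef>

// Sketch of the gradient-output rule: overwrite the child's gradient only when
// operating in-place (the two buffers alias), otherwise accumulate into it so
// that skip/add_prev style layers can also contribute their share.
void backward_sketch(
    const std::vector<float>& gradient_input,
    std::vector<float>& data_grad
)
{
    const bool running_in_place = (&gradient_input == &data_grad);
    for (std::size_t i = 0; i < data_grad.size(); ++i)
    {
        const float g = 2*gradient_input[i]; // stand-in for the layer's real derivative
        if (running_in_place)
            data_grad[i] = g;   // safe: no other layer can see this buffer
        else
            data_grad[i] += g;  // others may have added here already; don't clobber
    }
}
```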
@@ -2849,8 +2849,11 @@ namespace dlib
         resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
         params_grad1 = params_grad;
         params_grad2 = params_grad;
-        // Now call backward() and make sure it works as well.
-        subnetwork2.get_gradient_input() = 9999;
+        // Now call backward() and make sure it works as well.  Recall that when an
+        // in-place layer works in-place it assigns to its outputs, but when it's
+        // not running in-place it adds.  So we initialize to a non-zero value to
+        // check that this is the behavior that really executes.
+        subnetwork2.get_gradient_input() = 9;
         impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
         data_grad1 = subnetwork2.get_gradient_input();
@@ -2868,7 +2871,7 @@ namespace dlib
                 return layer_test_results(sout.str());
             }
         }
-        const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
+        const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2)));
         if (backward_data_error > 0.00001)
         {
             using namespace std;
@@ -2934,8 +2937,7 @@ namespace dlib
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
             double output_derivative = subnetwork.get_gradient_input_element(i);
-            if (!impl::is_inplace_layer(l,subnetwork))
-                output_derivative -= initial_gradient_input[i];
+            output_derivative -= initial_gradient_input[i];
             double relative_error;
             if (reference_derivative != 0)
                 relative_error = (reference_derivative - output_derivative)/(reference_derivative);
......
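The test changes above hinge on a small piece of arithmetic: the child's gradient buffer is seeded with 9, so a layer that correctly *adds* (because it is not running in-place) leaves 9 plus the gradient in the buffer, and the true gradient is recovered by subtracting 9, which is exactly what the updated `mat(data_grad1)-9` comparison does. A toy version of that check (plain C++, hypothetical values):

```cpp
#include <cassert>

int main()
{
    float child_grad = 9;            // the test seeds the buffer with a non-zero value
    const float layer_grad = 0.25f;  // hypothetical gradient produced by the layer
    child_grad += layer_grad;        // convention: add when not running in-place
    assert(child_grad - 9 == layer_grad); // recovered exactly, as the test expects
    return 0;
}
```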
@@ -16,6 +16,7 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -38,24 +39,44 @@
         const auto s2 = src2.host();
         if (dest.size() == src1.size() && src1.size() == src2.size())
         {
-            for (size_t i = 0; i < src1.size(); ++i)
-                d[i] = s1[i]*s2[i];
+            if (add_to)
+            {
+                for (size_t i = 0; i < src1.size(); ++i)
+                    d[i] += s1[i]*s2[i];
+            }
+            else
+            {
+                for (size_t i = 0; i < src1.size(); ++i)
+                    d[i] = s1[i]*s2[i];
+            }
         }
         else if (dest.num_samples() == 1)
         {
-            for (size_t i = 0; i < dest.size(); ++i)
-                d[i] = 0;
+            if (!add_to)
+            {
+                for (size_t i = 0; i < dest.size(); ++i)
+                    d[i] = 0;
+            }
             for (size_t i = 0; i < max_size; ++i)
                 d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()];
         }
         else
         {
-            for (size_t i = 0; i < max_size; ++i)
-                d[i] = s1[i%src1.size()]*s2[i%src2.size()];
+            if (add_to)
+            {
+                for (size_t i = 0; i < max_size; ++i)
+                    d[i] += s1[i%src1.size()]*s2[i%src2.size()];
+            }
+            else
+            {
+                for (size_t i = 0; i < max_size; ++i)
+                    d[i] = s1[i%src1.size()]*s2[i%src2.size()];
+            }
         }
     }
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -68,15 +89,34 @@
         {
             DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),"");
-            for (long n = 0; n < dest.num_samples(); ++n)
+            if (add_to)
             {
-                for (long k = 0; k < dest.k(); ++k)
+                for (long n = 0; n < dest.num_samples(); ++n)
                 {
-                    for (long r = 0; r < dest.nr(); ++r)
+                    for (long k = 0; k < dest.k(); ++k)
                     {
-                        for (long c = 0; c < dest.nc(); ++c)
+                        for (long r = 0; r < dest.nr(); ++r)
                         {
-                            *d++ = (*s1++)*s2[k];
+                            for (long c = 0; c < dest.nc(); ++c)
+                            {
+                                *d++ += (*s1++)*s2[k];
+                            }
                         }
                     }
                 }
            }
+            else
+            {
+                for (long n = 0; n < dest.num_samples(); ++n)
+                {
+                    for (long k = 0; k < dest.k(); ++k)
+                    {
+                        for (long r = 0; r < dest.nr(); ++r)
+                        {
+                            for (long c = 0; c < dest.nc(); ++c)
+                            {
+                                *d++ = (*s1++)*s2[k];
+                            }
+                        }
+                    }
+                }
+            }
@@ -87,8 +127,11 @@ namespace dlib
             DLIB_CASSERT(have_same_dimensions(src1,src2),"");
             DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),"");
-            for (long k = 0; k < src1.k(); ++k)
-                d[k] = 0;
+            if (!add_to)
+            {
+                for (long k = 0; k < src1.k(); ++k)
+                    d[k] = 0;
+            }
             for (long n = 0; n < src1.num_samples(); ++n)
             {
@@ -1105,8 +1148,16 @@ namespace dlib
                     float temp = 0;
                     for (long k = 0; k < grad.k(); ++k)
                         temp += -d3[k*num]*in3[k*num];
-                    for (long k = 0; k < grad.k(); ++k)
-                        g3[k*num] = d3[k*num]*(temp+in3[k*num]);
+                    if (is_same_object(gradient_input, grad))
+                    {
+                        for (long k = 0; k < grad.k(); ++k)
+                            g3[k*num] = d3[k*num]*(temp+in3[k*num]);
+                    }
+                    else
+                    {
+                        for (long k = 0; k < grad.k(); ++k)
+                            g3[k*num] += d3[k*num]*(temp+in3[k*num]);
+                    }
                 }
             }
         }
@@ -1134,8 +1185,16 @@ namespace dlib
         const auto g = grad.host();
         const auto d = dest.host();
         const auto in = gradient_input.host();
-        for (size_t i = 0; i < dest.size(); ++i)
-            g[i] = in[i]*d[i]*(1-d[i]);
+        if (is_same_object(gradient_input, grad))
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] = in[i]*d[i]*(1-d[i]);
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] += in[i]*d[i]*(1-d[i]);
+        }
     }
 // ------------------------------------------------------------------------------------
@@ -1157,12 +1216,23 @@ namespace dlib
         const float* gi = gradient_input.host();
         const float* in = dest.host();
         float* out = grad.host();
-        for (size_t i = 0; i < dest.size(); ++i)
+        if (is_same_object(grad, gradient_input))
         {
-            if (in[i] > 0)
-                out[i] = gi[i];
-            else
-                out[i] = 0;
+            for (size_t i = 0; i < dest.size(); ++i)
+            {
+                if (in[i] > 0)
+                    out[i] = gi[i];
+                else
+                    out[i] = 0;
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+            {
+                if (in[i] > 0)
+                    out[i] += gi[i];
+            }
         }
     }
@@ -1194,6 +1264,7 @@ namespace dlib
         tensor& params_grad
     )
     {
+        DLIB_CASSERT(is_same_object(grad, gradient_input) == false,"");
         const float p = param.host()[0];
         const float* gi = gradient_input.host();
         const float* s = src.host();
@@ -1236,8 +1307,16 @@ namespace dlib
         const auto g = grad.host();
         const auto d = dest.host();
         const auto in = gradient_input.host();
-        for (size_t i = 0; i < dest.size(); ++i)
-            g[i] = in[i]*(1-d[i]*d[i]);
+        if (is_same_object(grad, gradient_input))
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] = in[i]*(1-d[i]*d[i]);
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] += in[i]*(1-d[i]*d[i]);
+        }
     }
 // ------------------------------------------------------------------------------------
......
@@ -16,12 +16,14 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     );
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
......
@@ -122,7 +122,34 @@ namespace dlib
             }
         }
+        __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] += s1[i]*s2[i];
+            }
+        }
+        __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2,
+                                               size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                for (size_t j = i; j < max_size; j += n)
+                    d[i] += s1[j%s1_n]*s2[j%s2_n];
+            }
+        }
+        __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2,
+                                               size_t n, size_t s1_n, size_t s2_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] += s1[i%s1_n]*s2[i%s2_n];
+            }
+        }
         void multiply (
+            bool add_to,
             tensor& dest,
             const tensor& src1,
             const tensor& src2
@@ -146,17 +173,28 @@
         const auto s2 = src2.host();
         if (dest.size() == src1.size() && src1.size() == src2.size())
         {
-            launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+            if (add_to)
+                launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+            else
+                launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
         }
         else if (dest.num_samples() == 1)
         {
-            launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
-                dest.size(), src1.size(), src2.size(), max_size);
+            if (add_to)
+                launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size(), max_size);
+            else
+                launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size(), max_size);
         }
         else
         {
-            launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
-                dest.size(), src1.size(), src2.size());
+            if (add_to)
+                launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size());
+            else
+                launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size());
         }
     }
@@ -191,8 +229,33 @@ namespace dlib
             }
         }
+        __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                auto k = (i/bs)%ks;
+                d[i] += s1[i]*s2[k];
+            }
+        }
+        __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+        {
+            // loop over all the image planes
+            for (auto i : grid_stride_range_y(0, n))
+            {
+                // sum all the elements in the i-th image plane
+                float temp = 0;
+                for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+                    temp += s1[j]*s2[j];
+                auto k = i%ks;
+                // and store the sum into d[k]
+                warp_reduce_atomic_add(d[k], temp);
+            }
+        }
         void multiply_conv (
+            bool add_to,
             tensor& dest,
             const tensor& src1,
             const tensor& src2
@@ -204,8 +267,12 @@ namespace dlib
             if (dest.size() == 0)
                 return;
-            launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
-                dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            if (add_to)
+                launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()),
+                    dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            else
+                launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
+                    dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
         }
         else
         {
@@ -216,8 +283,12 @@ namespace dlib
             dim3 blocks(10,1);
             dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_multiply_conv2<<<blocks,threads>>>(
-                dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            if (add_to)
+                _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
+                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            else
+                _cuda_multiply_conv2<<<blocks,threads>>>(
+                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
         }
     }
......
@@ -102,12 +102,14 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     );
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
......
@@ -1265,7 +1265,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                                          CUDNN_SOFTMAX_ACCURATE,
                                          CUDNN_SOFTMAX_MODE_CHANNEL,
@@ -1316,7 +1316,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             sigmoid_activation_descriptor(),
                                             &alpha,
@@ -1367,7 +1367,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             relu_activation_descriptor(),
                                             &alpha,
@@ -1418,7 +1418,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             tanh_activation_descriptor(),
                                             &alpha,
......
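The one-line cuDNN changes above work because the cudnn*Backward routines blend their result into the output as grad = alpha*result + beta*grad, so beta selects between assignment and accumulation. A scalar illustration (not dlib code):

```cpp
// cuDNN blending: beta = 0 overwrites grad, beta = 1 adds to it.
float blend(float result, float grad, float alpha, float beta)
{
    return alpha*result + beta*grad;
}
// blend(r, g, 1, 0) == r      -> in-place case: assign
// blend(r, g, 1, 1) == r + g  -> non-in-place case: accumulate
```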
@@ -966,7 +966,7 @@ namespace dlib
             mask.copy_size(input);
             rnd.fill_uniform(mask);
             tt::threshold(mask, drop_rate);
-            tt::multiply(output, input, mask);
+            tt::multiply(false, output, input, mask);
         }
         void backward_inplace(
@@ -975,7 +975,10 @@ namespace dlib
             tensor& /*params_grad*/
         )
         {
-            tt::multiply(data_grad, mask, gradient_input);
+            if (is_same_object(gradient_input, data_grad))
+                tt::multiply(false, data_grad, mask, gradient_input);
+            else
+                tt::multiply(true, data_grad, mask, gradient_input);
         }
         const tensor& get_layer_params() const { return params; }
@@ -1044,7 +1047,7 @@ namespace dlib
         void forward_inplace(const tensor& input, tensor& output)
         {
-            tt::affine_transform(output, input, val, 0);
+            tt::affine_transform(output, input, val);
         }
         void backward_inplace(
@@ -1053,7 +1056,10 @@ namespace dlib
             tensor& /*params_grad*/
         )
         {
-            tt::affine_transform(data_grad, gradient_input, val, 0);
+            if (is_same_object(gradient_input, data_grad))
+                tt::affine_transform(data_grad, gradient_input, val);
+            else
+                tt::affine_transform(data_grad, data_grad, gradient_input, 1, val);
         }
         const tensor& get_layer_params() const { return params; }
@@ -1187,11 +1193,17 @@ namespace dlib
             // We are computing the gradient of dot(gradient_input, computed_output*g + b)
             if (mode == FC_MODE)
             {
-                tt::multiply(data_grad, gradient_input, g);
+                if (is_same_object(gradient_input, data_grad))
+                    tt::multiply(false, data_grad, gradient_input, g);
+                else
+                    tt::multiply(true, data_grad, gradient_input, g);
             }
             else
             {
-                tt::multiply_conv(data_grad, gradient_input, g);
+                if (is_same_object(gradient_input, data_grad))
+                    tt::multiply_conv(false, data_grad, gradient_input, g);
+                else
+                    tt::multiply_conv(true, data_grad, gradient_input, g);
             }
         }
......
@@ -287,7 +287,10 @@ namespace dlib
                 to data_input.
                 Finally, backward_inplace() outputs these gradients by performing:
                     - params_grad = PARAMETER_GRADIENT
-                    - data_grad = DATA_GRADIENT
+                    - if (is_same_object(gradient_input, data_grad)) then
+                        - data_grad = DATA_GRADIENT
+                    - else
+                        - data_grad += DATA_GRADIENT
         !*/
         const tensor& get_layer_params(
......
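The `data_grad += DATA_GRADIENT` branch in the spec above is what keeps multi-branch architectures correct: when one tensor feeds several branches, its total gradient is the sum of the gradients flowing back along each branch, so each branch must add its contribution. A toy model of that accumulation (plain C++, illustrative names):

```cpp
#include <vector>
#include <cstddef>

// Each branch adds its share of the gradient into the shared child buffer.
// If any branch assigned instead, the other branches' contributions would be lost.
void accumulate_branch_gradient(
    const std::vector<float>& branch_grad,
    std::vector<float>& child_grad
)
{
    for (std::size_t i = 0; i < child_grad.size(); ++i)
        child_grad[i] += branch_grad[i];
}
```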
@@ -127,6 +127,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -140,23 +141,24 @@ namespace dlib { namespace tt
             (src1.num_samples()==1 || src1.num_samples()==MD) &&
             (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
 #ifdef DLIB_USE_CUDA
-        cuda::multiply(dest, src1, src2);
+        cuda::multiply(add_to, dest, src1, src2);
 #else
-        cpu::multiply(dest, src1, src2);
+        cpu::multiply(add_to, dest, src1, src2);
 #endif
     }
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::multiply_conv(dest, src1, src2);
+        cuda::multiply_conv(add_to, dest, src1, src2);
 #else
-        cpu::multiply_conv(dest, src1, src2);
+        cpu::multiply_conv(add_to, dest, src1, src2);
 #endif
     }
......
@@ -101,6 +101,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -124,9 +125,12 @@ namespace dlib { namespace tt
             Second, if dest.num_samples()==1, then after the pointwise multiplication of
             src1 with src2, the result has its samples summed to produce an output tensor
             with num_samples()==1 which is then assigned to #dest.
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
     !*/
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -152,6 +156,8 @@ namespace dlib { namespace tt
                     #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
             - else
                     #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
     !*/
 // ----------------------------------------------------------------------------------------
@@ -865,11 +871,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad, dest)==false
         ensures
             - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
-              Then let f(SRC) == dot(gradient_input,dest)  Then this function computes the
-              gradient of f() with respect to SRC and adds it to grad.
+              Then let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -899,12 +907,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -934,12 +943,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
            - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -978,6 +988,7 @@ namespace dlib { namespace tt
             - have_same_dimensions(grad,gradient_input) == true
             - param.size() == 1
             - params_grad.size() == 1
+            - is_same_object(grad, gradient_input) == false
         ensures
             - Recalling that dest is the output of prelu(dest,src,param) let
               f(src,param) == dot(gradient_input,dest)
@@ -1011,12 +1022,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
......
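For reference, a minimal usage sketch of the new add_to flag on tt::multiply(), following the spec above (assumes dlib's headers as of this commit; the sizes and values are arbitrary):

```cpp
#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor dest(2,3), src1(2,3), src2(2,3);
    src1 = 1; src2 = 3; dest = 0;                 // scalar assignment fills a tensor
    dlib::tt::multiply(false, dest, src1, src2);  // dest  = src1 .* src2  (all 3s)
    dlib::tt::multiply(true,  dest, src1, src2);  // dest += src1 .* src2  (all 6s)
    return 0;
}
```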
@@ -362,7 +362,10 @@ namespace
         DLIB_TEST(max(abs(truth3-mat(dest))) < 1e-5);
         matrix<float> truth4 = pointwise_multiply(mat(A), mat(B));
-        tt::multiply(A, A, B);
+        tt::multiply(false, A, A, B);
         DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
+        truth4 = pointwise_multiply(mat(A), mat(B)) + mat(A);
+        tt::multiply(true, A, A, B);
+        DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
         matrix<float> truth5 = mat(B) > 0.1;
@@ -418,25 +421,34 @@ namespace
             dest.set_size(1,4);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
             A.set_size(1,4);
             rnd.fill_uniform(A);
             matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
-            tt::multiply(dest, B, A);
+            tt::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
+            matrix<float> prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
             dest.set_size(3,4);
-            tt::multiply(dest, B, A);
+            tt::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
         }
         {
@@ -731,8 +743,11 @@ namespace
             rnd.fill_uniform(dest);
             rnd.fill_uniform(src);
             dest2 = dest; src2 = src;
-            cuda::multiply(dest, dest, src);
-            cpu::multiply(dest2, dest2, src2);
+            cuda::multiply(false, dest, dest, src);
+            cpu::multiply(false, dest2, dest2, src2);
+            DLIB_TEST(equal(mat(dest),mat(dest2)));
+            cuda::multiply(true, dest, dest, src);
+            cpu::multiply(true, dest2, dest2, src2);
             DLIB_TEST(equal(mat(dest),mat(dest2)));
@@ -801,24 +816,30 @@ namespace
             dest.set_size(1,4);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));
             A.set_size(1,4);
             rnd.fill_uniform(A);
             matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
-            cuda::multiply(dest, B, A);
+            cuda::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
+            matrix<float> prevdest = mat(dest);
+            cuda::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
             dest.set_size(3,4);
-            cuda::multiply(dest, B, A);
+            cuda::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            cuda::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
         }
     }
@@ -955,8 +976,11 @@ namespace
             trand.fill_uniform(src1);
             trand.fill_uniform(src2);
-            cpu::multiply_conv(dest1, src1, src2);
-            cuda::multiply_conv(dest2, src1, src2);
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(false, dest2, src1, src2);
+            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
+            cpu::multiply_conv(true, dest1, src1, src2);
+            cuda::multiply_conv(true, dest2, src1, src2);
             DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
@@ -968,12 +992,19 @@ namespace
             trand.fill_uniform(dest2);
             trand.fill_uniform(src1);
             trand.fill_uniform(src2);
-            cpu::multiply_conv(dest1, src1, src2);
-            cuda::multiply_conv(dest2, src1, src2);
-            const float scale = max(abs(mat(dest1)));
-            const float scalem = mean(abs(mat(dest1)));
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(false, dest2, src1, src2);
+            float scale = max(abs(mat(dest1)));
+            float scalem = mean(abs(mat(dest1)));
             DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
             DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
+            matrix<float> prevd2 = mat(dest2);
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(true, dest2, src1, src2);
+            scale = max(abs(mat(dest1)));
+            scalem = mean(abs(mat(dest1)));
+            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)+prevd2))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)+prevd2))/scale);
+            DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem);
         }
         for (int iter = 0; iter < 100; ++iter)
......